Created
April 22, 2015 09:17
-
-
Save Dan-Q/b137278f585128799d4d to your computer and use it in GitHub Desktop.
A tool written during the EEBO-TCP Hackathon at the Weston Library, University of Oxford, in March 2015. Extracts markup features from XML documents and produces HTML tables showing their frequency across a corpus.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Run in a directory containing any number of XML files from the EEBO-TCP project,
# which can be acquired via GitHub at https://github.com/textcreationpartnership,
# among other ways. It uses the Nokogiri XML parser to perform frequency counts of
# each of the selected features (specified in CSS3 syntax) and outputs the result
# to an HTML table (by default, called "output.html").
#
# Thrown together quickly for the EEBO-TCP Hackathon at the Weston Library in Oxford
# in March 2015, as described at https://danq.me/2015/04/22/eebo-tcp-hackathon, this
# software is offered into the public domain, without any warranty or liability, and
# can be used, adapted, and redistributed without license for any purpose.
#
# Author: Dan Q | https://danq.me/
#
require 'rubygems'
require 'erb'
require 'nokogiri'
# Markup features to count, in report column order.
#
# Each key is the label used for the column headings; each value is the CSS3
# selector evaluated against every XML document. The hash is frozen so the
# feature list cannot be modified by accident while the report is being built.
FEATURES = {
  'letter' => 'div[type="letter"]',
  # The $= (ends-with) selector below matches every div whose type ends in
  # "reader", so it covers the three commented-out variants in a single column.
  # 'to the reader' => 'div[type="to_the_reader"]',
  '.._reader' => 'div[type$="reader"]',
  # 'translator to the reader' => 'div[type="translator_to_the_reader"]',
  # 'publisher to the reader' => 'div[type="publisher_to_the_reader"]',
  'dedication' => 'div[type="dedication"]',
  'preface' => 'div[type="preface"]',
  'chapter' => 'div[type="chapter"]',
  'book' => 'div[type="book"]',
  'epigraph' => 'div[type="epigraph"]',
  'illustration' => 'div[type="illustration"]',
  'frontispiece' => 'div[type="frontispiece"]',
  'map' => 'div[type="map"]',
  'poem' => 'div[type="poem"]',
  'encomium' => 'div[type="encomium"]',
  # NOTE(review): "personi" looks like a misspelling of "personae" -- confirm
  # against the type values actually present in the corpus before changing it.
  'dramatis personi' => 'div[type="dramatis_personi"]',
  'argument' => 'div[type="argument"]',
  'character description' => 'div[type="character_description"]'
}.freeze
# Counts the nodes of +xml+ matched by the CSS selector +css+.
#
# @param xml [#css] parsed document; in this script a Nokogiri XML document
# @param css [String] CSS3 selector, e.g. one of the FEATURES values
# @return [Integer] number of matching nodes
def count_divs_of_type(xml, css)
  xml.css(css).count
end

# Reports whether +xml+ contains at least one node matched by +css+.
# The result is an Integer flag rather than a boolean because it is written
# straight into a table cell.
#
# @param xml [#css] parsed document
# @param css [String] CSS3 selector
# @return [Integer] 1 when at least one node matches, otherwise 0
def has_divs_of_type(xml, css)
  count_divs_of_type(xml, css) > 0 ? 1 : 0
end

# Writes output.html: one table row per *.xml file in the current directory,
# with a count column and a 0/1 presence column for every entry in FEATURES.
#
# NOTE(review): the tablesorter script is loaded from Google Code hosting,
# which may no longer be served -- confirm and swap for a maintained CDN.
File.open('output.html', 'w:UTF-8') do |out|
  # Asset URLs carry an explicit https scheme: protocol-relative "//host/..."
  # URLs resolve to file://host/... when the report is opened from disk.
  out.puts <<-EOF
<!DOCTYPE html>
<html>
<head>
<meta charset=utf-8 />
<title>EEBO-TCP features analysis</title>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css" />
<script type="text/javascript" src="https://code.jquery.com/jquery-2.1.3.min.js"></script>
<script type="text/javascript" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/js/bootstrap.min.js"></script>
<script type="text/javascript" src="https://my-script-hosting.googlecode.com/files/jquery.tablesorter.min.js"></script>
</head>
<body>
<table class="table table-striped table-bordered table-hover">
<thead><tr>
<th>ID</th>
<th>Title</th>
  EOF

  FEATURES.each_key do |label|
    heading = ERB::Util.html_escape(label)
    out.puts "<th>#{heading}</th>"
    out.puts "<th>any #{heading}</th>"
  end
  # Close the header row as well as the header section.
  out.puts '</tr></thead><tbody>'

  # Regular files only (a directory named "x.xml" cannot be read), sorted so
  # the row order does not depend on the order the OS lists the directory in.
  xml_files = Dir.entries('.').select do |name|
    name.end_with?('.xml') && File.file?(name)
  end.sort

  xml_files.each do |file|
    xml = Nokogiri::XML(File.read(file))

    # A document without a <title> element gets an empty cell instead of
    # aborting the whole run with a NoMethodError on nil.
    title_node = xml.at_css('title')
    title = title_node ? title_node.text : ''

    out.puts '<tr>'
    # File names and titles are arbitrary text, so escape them for HTML.
    out.puts "<td>#{ERB::Util.html_escape(File.basename(file, '.xml'))}</td>"
    out.puts "<td>#{ERB::Util.html_escape(title)}</td>"
    FEATURES.each_value do |selector|
      out.puts "<td>#{count_divs_of_type(xml, selector)}</td>"
      out.puts "<td>#{has_divs_of_type(xml, selector)}</td>"
    end
    out.puts '</tr>'
  end

  # The inline script element must be closed before </body>.
  out.puts <<-EOF
</tbody></table>
<script type="text/javascript">
$(function(){
$("table").tablesorter();
});
</script>
</body></html>
  EOF
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment