# richardlehane/timeline.rb

## timeline.rb
#
# This script generates a CSV that can be used by ProPublica's timeline-setter
# tool to make a nice timeline. It calls out to Wikipedia and Wragge's unofficial
# TROVE API to fill out the data provided by State Records NSW in the ministries.xml file.
#
#
require 'rubygems'
require 'nokogiri'

require 'date'
require 'erb'
require 'net/http'
require 'uri'

# Proxy host and port handed to Net::HTTP::Proxy for every outgoing request.
# You may not need these if you aren't accessing the web through a corporate firewall!
PROXY = 'MY_CORPORATE_FIREWALL'
PORT = 8080

# English month names in calendar order (index 0 is January); used to build the display date.
MONTHS = %w[
  January February March April May June
  July August September October November December
]
# Common prefix of every ministry link; the ministry number is appended to it.
LINK_ROOT = "http://investigator.records.nsw.gov.au/Entity.aspx?Path=\\Ministry\\"

# Retrieve table elements from wikipedia (to grab Labor and Liberal leader names).
#
# path  - page name, appended to "/wiki/" on en.wikipedia.org
# query - XPath fragment appended verbatim after "th", e.g. "='Party leader'"
#
# Returns the nodes matching //table[tr/th<query>], i.e. every table that has a
# row whose header cell satisfies the fragment. The request goes through the
# PROXY/PORT proxy.
def get_wiki_table(path, query)
  page_html = Net::HTTP::Proxy(PROXY, PORT).start('en.wikipedia.org') do |connection|
    connection.get("/wiki/" + path).body
  end
  Nokogiri::HTML(page_html).xpath("//table[tr/th#{query}]")
end

# Wikipedia has a whole page devoted to NSW Labor leaders, they are pretty easy to scrape.
# In each data row the second cell is read as the leader's full name and the fifth cell
# as their time as premier; LABOR_LEADERS ends up holding the surnames (last word of the
# name) of the leaders whose fifth cell is not empty.
lab_table = get_wiki_table "Leader_of_the_Australian_Labor_Party_in_New_South_Wales", "='Party leader'"
LABOR_LEADERS = lab_table.xpath("tr[td]").map do |row|
  cells = row.xpath("td")
  next if cells.length < 5          # too few cells to hold both a name and a premier entry
  next if cells[4].content.empty?   # filter out any NSW Labor leaders that weren't premiers

  cells[1].content.split[-1]
end.compact # not compact!: that returns nil when there was nothing to remove

# ... for the Libs however we have to go to the Liberal Party's page and the scraping isn't as simple:
# start at the row after the 'New South Wales' header row and take the surname (last word of the
# first cell) from every following row, stopping at the next row that contains a header cell or
# at the end of the table, whichever comes first.
lib_table = get_wiki_table "Liberal_Party_of_Australia", "/a='New South Wales'"
row = lib_table.xpath("tr[th/a='New South Wales']")[0].next_element
LIBERAL_LEADERS = []
while row && !row.at("th")
  first_cell = row.xpath("td")[0]
  # a row without cells, or with a blank first cell, yields no surname and is skipped
  surname = first_cell && first_cell.content.split[-1]
  LIBERAL_LEADERS << surname if surname
  row = row.next_element
end

# For all the pre-1955 ministries, call out to Wragge's unofficial TROVE API for relevant articles.
#
# date    - first day of the search window, as a string Date.parse accepts; the window
#           ends on the following day
# premier - name searched for as an exact phrase (downcased and percent-encoded first)
#
# Returns a String of HTML with one <p><a href="url">title</a></p> per article, for at
# most the first five results that carry both a title and a url. Returns an empty String
# when there are none. Titles and urls come from a remote service, so they are
# HTML-escaped before being embedded.
def get_articles(date, premier)
  query = '/api/newspapers/articles/?state=nsw&format=xml&article_type=news'
  query += '&start_date=' + date
  query += '&end_date=' + (Date.parse(date) + 1).to_s
  # Encode the name so spaces, apostrophes, ampersands etc. cannot corrupt the query string.
  query += '&exact=' + URI.encode_www_form_component(premier.downcase)

  wragge_string = Net::HTTP::Proxy(PROXY, PORT).start('wraggelabs.appspot.com') { |http| http.get(query).body }

  # root is nil when the response has no XML root element (e.g. an empty body).
  root = Nokogiri::XML(wragge_string).root
  resources = root ? root.xpath("results/resource") : []

  links = []
  resources.each do |resource|
    title = resource.at('title')
    url = resource.at('url')
    next unless title && url # skip results that are missing either element

    links << '<p><a href="' + ERB::Util.html_escape(url.content) + '">' +
             ERB::Util.html_escape(title.content) + '</a></p>'
    break if links.length == 5
  end
  links.join
end

# Start constructing our CSV for timeline-setter: the header line first, then one line per ministry.
rows = ["date,display_date,description,link,,series,html"]

# Parse SRNSW's data inside a block so the file handle is closed once parsing is done.
xml = File.open('ministries.xml') { |file| Nokogiri::XML(file) }

# Now go through each Ministry in SRNSW's data and add it to the CSV
xml.root.search('Ministry').each do |ministry|
  puts "..working"

  # Only the first ten characters of the start date are kept; they are sliced as
  # year [0..3], month [5..6] and day [8..9].
  date = ministry.at('Start_date').content[0..9]
  year_int = date[0..3].to_i
  month = MONTHS.fetch(date[5..6].to_i - 1)
  display_date = "#{date[8..9].to_i} #{month} #{year_int}"

  description = ministry.at('Ministry_title').content
  link = LINK_ROOT + ministry.at('Ministry_number').content

  # check if we can classify our Premier according to our Wikipedia lists;
  # the premier is taken to be the first word of the ministry title
  premier = description.split(/\W/)[0]
  series =
    if LABOR_LEADERS.include?(premier)
      'labor'
    elsif LIBERAL_LEADERS.include?(premier)
      'coalition'
    else
      'other'
    end

  # add a heading and grab TROVE articles if pre-1955 (and there is a name to search for)
  html = '<H1>' + description + '</H1>'
  html += get_articles(date, premier) if year_int < 1955 && premier && !premier.empty?

  # The html cell is the only quoted one, so every quote inside it is doubled,
  # in the ministry title as well as in the article links.
  html_cell = '"' + html.gsub('"', '""') + '"'

  # The description column and the unnamed fifth column are left empty.
  rows << [date, display_date, '', link, '', series, html_cell].join(',')
end

File.open("output.csv", 'w') { |file| file.write(rows.join("\n") + "\n") }
	#
	# This script generates a CSV that can be used by ProPublica's timeline-setter
	# tool to make a nice timeline. It calls out to Wikipedia and Wragge's unofficial
	# TROVE API to fill out the data provided by State Records NSW in the ministries.xml file.
	#
	#
	require 'rubygems'
	require 'nokogiri'
	require 'net/http'
	require 'date'

	# Proxy host and port handed to Net::HTTP::Proxy for every outgoing request.
	# You may not need these if you aren't accessing the web through a corporate firewall!
	PROXY = 'MY_CORPORATE_FIREWALL'
	PORT = 8080

	# English month names in calendar order (index 0 is January); used to build the display date.
	MONTHS = %w[
	  January February March April May June
	  July August September October November December
	]
	# Common prefix of every ministry link; the ministry number is appended to it.
	LINK_ROOT = "http://investigator.records.nsw.gov.au/Entity.aspx?Path=\\Ministry\\"

	# Retrieve table elements from wikipedia (to grab Labor and Liberal leader names).
	#
	# path  - page name, appended to "/wiki/" on en.wikipedia.org
	# query - XPath fragment appended verbatim after "th", e.g. "='Party leader'"
	#
	# Returns the nodes matching //table[tr/th<query>], i.e. every table that has a
	# row whose header cell satisfies the fragment. The request goes through the
	# PROXY/PORT proxy.
	def get_wiki_table(path, query)
	  page_html = Net::HTTP::Proxy(PROXY, PORT).start('en.wikipedia.org') do |connection|
	    connection.get("/wiki/" + path).body
	  end
	  Nokogiri::HTML(page_html).xpath("//table[tr/th#{query}]")
	end

	# Wikipedia has a whole page devoted to NSW Labor leaders, they are pretty easy to scrape.
	# In each data row the second cell is read as the leader's full name and the fifth cell
	# as their time as premier; LABOR_LEADERS ends up holding the surnames (last word of the
	# name) of the leaders whose fifth cell is not empty.
	lab_table = get_wiki_table "Leader_of_the_Australian_Labor_Party_in_New_South_Wales", "='Party leader'"
	LABOR_LEADERS = lab_table.xpath("tr[td]").map do |row|
	  cells = row.xpath("td")
	  next if cells.length < 5          # too few cells to hold both a name and a premier entry
	  next if cells[4].content.empty?   # filter out any NSW Labor leaders that weren't premiers

	  cells[1].content.split[-1]
	end.compact # not compact!: that returns nil when there was nothing to remove

	# ... for the Libs however we have to go to the Liberal Party's page and the scraping isn't as simple:
	# start at the row after the 'New South Wales' header row and take the surname (last word of the
	# first cell) from every following row, stopping at the next row that contains a header cell or
	# at the end of the table, whichever comes first.
	lib_table = get_wiki_table "Liberal_Party_of_Australia", "/a='New South Wales'"
	row = lib_table.xpath("tr[th/a='New South Wales']")[0].next_element
	LIBERAL_LEADERS = []
	while row && !row.at("th")
	  first_cell = row.xpath("td")[0]
	  # a row without cells, or with a blank first cell, yields no surname and is skipped
	  surname = first_cell && first_cell.content.split[-1]
	  LIBERAL_LEADERS << surname if surname
	  row = row.next_element
	end

	# For all the pre-1955 ministries, call out to Wragge's unofficial TROVE API for relevant articles.
	#
	# date    - first day of the search window, as a string Date.parse accepts; the window
	#           ends on the following day
	# premier - name searched for as an exact phrase (downcased and percent-encoded first)
	#
	# Returns a String of HTML with one <p><a href="url">title</a></p> per article, for at
	# most the first five results that carry both a title and a url. Returns an empty String
	# when there are none. Titles and urls come from a remote service, so they are
	# HTML-escaped before being embedded.
	def get_articles(date, premier)
	  query = '/api/newspapers/articles/?state=nsw&format=xml&article_type=news'
	  query += '&start_date=' + date
	  query += '&end_date=' + (Date.parse(date) + 1).to_s
	  # Encode the name so spaces, apostrophes, ampersands etc. cannot corrupt the query string.
	  query += '&exact=' + URI.encode_www_form_component(premier.downcase)

	  wragge_string = Net::HTTP::Proxy(PROXY, PORT).start('wraggelabs.appspot.com') { |http| http.get(query).body }

	  # root is nil when the response has no XML root element (e.g. an empty body).
	  root = Nokogiri::XML(wragge_string).root
	  resources = root ? root.xpath("results/resource") : []

	  links = []
	  resources.each do |resource|
	    title = resource.at('title')
	    url = resource.at('url')
	    next unless title && url # skip results that are missing either element

	    links << '<p><a href="' + ERB::Util.html_escape(url.content) + '">' +
	             ERB::Util.html_escape(title.content) + '</a></p>'
	    break if links.length == 5
	  end
	  links.join
	end

	# Start constructing our CSV for timeline-setter: the header line first, then one line per ministry.
	rows = ["date,display_date,description,link,,series,html"]

	# Parse SRNSW's data inside a block so the file handle is closed once parsing is done.
	xml = File.open('ministries.xml') { |file| Nokogiri::XML(file) }

	# Now go through each Ministry in SRNSW's data and add it to the CSV
	xml.root.search('Ministry').each do |ministry|
	  puts "..working"

	  # Only the first ten characters of the start date are kept; they are sliced as
	  # year [0..3], month [5..6] and day [8..9].
	  date = ministry.at('Start_date').content[0..9]
	  year_int = date[0..3].to_i
	  month = MONTHS.fetch(date[5..6].to_i - 1)
	  display_date = "#{date[8..9].to_i} #{month} #{year_int}"

	  description = ministry.at('Ministry_title').content
	  link = LINK_ROOT + ministry.at('Ministry_number').content

	  # check if we can classify our Premier according to our Wikipedia lists;
	  # the premier is taken to be the first word of the ministry title
	  premier = description.split(/\W/)[0]
	  series =
	    if LABOR_LEADERS.include?(premier)
	      'labor'
	    elsif LIBERAL_LEADERS.include?(premier)
	      'coalition'
	    else
	      'other'
	    end

	  # add a heading and grab TROVE articles if pre-1955 (and there is a name to search for)
	  html = '<H1>' + description + '</H1>'
	  html += get_articles(date, premier) if year_int < 1955 && premier && !premier.empty?

	  # The html cell is the only quoted one, so every quote inside it is doubled,
	  # in the ministry title as well as in the article links.
	  html_cell = '"' + html.gsub('"', '""') + '"'

	  # The description column and the unnamed fifth column are left empty.
	  rows << [date, display_date, '', link, '', series, html_cell].join(',')
	end

	File.open("output.csv", 'w') { |file| file.write(rows.join("\n") + "\n") }