Last active
December 19, 2015 23:19
-
-
Save mluedke2/6033951 to your computer and use it in GitHub Desktop.
In-progress scraper for the site hosting the Administrative Code of San Francisco, written at the request of City Hall.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'nokogiri' | |
require 'json' | |
require 'watir-webdriver' | |
# Utility used later to slice the document into chunks.
#
# Starting at +first+, repeatedly calls +next+ and collects every object
# visited up to and including +last+.
#
# first - the starting object; it (and each successor) must respond to +next+.
# last  - the terminator; the walk stops as soon as the current object == last.
#
# Returns an Array whose first element is +first+ and whose final element is
# +last+. If +last+ is nil, the walk ends when +next+ returns nil, and that nil
# is the final element. If +last+ is never reached and +next+ eventually
# returns nil, calling +next+ on nil raises NoMethodError.
#
# Implemented as a loop rather than recursion: the recursive form needed one
# stack frame and one array copy per element, which overflows the stack
# (SystemStackError) on long runs.
def collect_between(first, last)
  collected = [first]
  current = first
  until current == last
    current = current.next
    collected << current
  end
  collected
end
# Tells whether a chunk of HTML text carries the section class definition.
#
# str - the HTML text to inspect.
#
# Returns true when +str+ contains class="Section" (matched
# case-insensitively), false otherwise.
def is_a_section(str)
  match_position = str =~ /class="Section"/i
  !match_position.nil?
end
# Tells whether a chunk of HTML text contains the footer disclaimer sentence.
#
# str - the HTML text to inspect.
#
# Returns true when +str+ contains the disclaimer (matched
# case-insensitively), false otherwise.
def is_footer(str)
  # The final period is escaped so it matches a literal '.', not any character.
  (str =~ /American Legal Publishing Corporation provides these documents for informational purposes only\./i) != nil
end
# Open a browser on the San Francisco code viewer.
browser = Watir::Browser.new
browser.goto 'http://www.amlegal.com/nxt/gateway.dll?f=templates&fn=default.htm&vid=amlegal:sanfrancisco_ca'

# Re-parses the "contents" frame and returns [parsed document, fourth <div>
# under div#California_c], which the rest of the script treats as the
# first-level container.
snapshot_contents = lambda do
  parsed = Nokogiri::HTML(browser.frame(:name, "contents").html)
  [parsed, parsed.css("body").first.css('div#California_c').first.css('div')[3]]
end

# Clicks, in the live browser, the element matching a parsed node's CSS path.
click_in_contents = lambda do |node|
  browser.frame(:name, "contents").element(css: node.css_path).click
end

# Fixed pause so the page can update after a click (may need to be lengthened).
pause_for = lambda do |seconds|
  puts "waiting #{seconds} seconds for operation..."
  sleep(seconds)
  puts "done waiting"
end

# Click the first image inside the first-level container.
frame, first_level_container = snapshot_contents.call
an_img = first_level_container.css('img').first
click_in_contents.call(an_img)
pause_for.call(7)

# Re-read the frame after the click, then click the "more" node's image.
frame, first_level_container = snapshot_contents.call
morenode_img = first_level_container.css("[ct='application/morenode']").first.css('img').first
click_in_contents.call(morenode_img)
pause_for.call(5)

# Re-read once more after pressing the "more" button and collect the tree nodes.
frame, first_level_container = snapshot_contents.call
chapters = first_level_container.css("[class='treenode']")
puts "number of elements:", chapters.length

# Accumulators filled in while walking the chapters.
sections_array = []
titles_array = []
# ---------------------------------------------------------------------------
# Helpers used by the chapter loop below.
# ---------------------------------------------------------------------------

# Splits a raw section heading on its first two spaces.
#
# raw_title - heading text, e.g. 'SEC. 10.100-373. SOME TITLE'.
#
# Returns [token, rest]: the text between the first and second space, and
# everything after the second space. Raises if the heading has fewer than
# two spaces.
def split_section_heading(raw_title)
  remainder = raw_title[(raw_title.index(' ') + 1)..-1]
  token_end = remainder.index(' ')
  [remainder[0...token_end], remainder[(token_end + 1)..-1]]
end

# Concatenates the nodes strictly between the first element (the section
# heading) and the last element (the terminator) of +nodes+.
#
# Nodes are appended with String#+, so this relies on their implicit string
# conversion (presumably the node's text content for Nokogiri nodes; confirm).
def inner_section_text(nodes)
  text = ""
  (nodes[1...-1] || []).each { |node| text += node }
  text
end

# Builds the :heading hash for a section.
#
# The identifier is usually dotted ('10.100-373'): the part before the first
# period becomes :title and the rest :chaptersection. Identifiers without a
# period (e.g. an appendix label) are stored whole as :title.
def build_section_heading(identifier, catch_text)
  heading = {}
  period_index = identifier.index('.')
  if period_index
    heading[:title] = identifier[0...period_index]
    heading[:chaptersection] = identifier[(period_index + 1)..-1]
  else
    heading[:title] = identifier
  end
  heading[:identifier] = identifier
  heading[:catch_text] = catch_text
  heading
end

# Writes one section as JSON to sections/<identifier>.json.
def write_section_file(identifier, section_text, catch_text)
  section_object = {
    :text => section_text,
    :heading => build_section_heading(identifier, catch_text)
  }
  File.open("sections/" + identifier + ".json", "w") do |f|
    f.write(section_object.to_json)
  end
end

# Section files go into ./sections; create it up front so the first
# File.open does not fail when the directory is missing.
Dir.mkdir("sections") unless File.directory?("sections")

begin
  chapters.each_with_index do |chapter, i|
    # The first and last tree nodes are not numbered chapters; skip them.
    if i == 0 || i == chapters.length - 1
      puts "skipping non-numbered chapters"
      next
    end

    # Find the chapter's link in the contents frame and click it.
    chapter_link = chapter.css("[class='nodetext']")[0]
    browser.frame(:name, "contents").element(css: chapter_link.css_path).click
    # Fixed wait for the document frame to load.
    puts "waiting 3 seconds for operation..."
    sleep(3)
    puts "done waiting"

    # Switch to the main window and parse the loaded chapter.
    doc_body = Nokogiri::HTML(browser.frame(:name, "main").frame(:name, "docbody").html)
    chapter_title = doc_body.css("[class='Chapter']")[0].css('span')[0].text

    # The chapter number is separated from its name by ':' or, in some
    # chapters, by '.'. A colon only counts when it appears within the first
    # 20 characters; otherwise fall back to the period. (Previously a late
    # colon left the divider at 0 and produced garbage index/name values.)
    colon_index = chapter_title.index(':')
    period_index = chapter_title.index('.')
    divider_index = (colon_index && colon_index < 20) ? colon_index : period_index
    if divider_index.nil?
      puts "skipping chapter with unparseable title: " + chapter_title
      next
    end

    # Index: from two characters past the first 'R' up to the divider.
    # Name: everything from two characters past the divider.
    chapter_index = chapter_title[(chapter_title.index('R') + 2)..(divider_index - 1)]
    chapter_name = chapter_title[(divider_index + 2)..-1]
    puts chapter_index
    puts chapter_name
    # The second-to-last tree node is always recorded with index "A".
    chapter_index = "A" if i == chapters.length - 2
    titles_array.push([chapter_index, chapter_name])

    articles = doc_body.css("[class='Article']")
    # The disclaimer element marks the end of the chapter's content.
    footers = doc_body.search "[text()*='American Legal Publishing Corporation provides these documents for informational purposes only.']"
    footer = footers.last

    articles.each_with_index do |article, j|
      # An article's chunk runs to the next article, or to the footer for
      # the last article.
      article_end = j < articles.length - 1 ? articles[j + 1] : footer
      paragraphs = collect_between(article, article_end)
      sections1 = paragraphs.select { |node| is_a_section(node.to_s) }

      sections1.each_with_index do |section1, k|
        # NOTE(review): hard-coded skip of two sections in tree node 51,
        # kept from the original; the reason is not recorded in this file.
        next if i == 51 && (k == 91 || k == 92)

        # Most titles are in a span, a few only in the h5; take the
        # element's full text with newlines flattened.
        section_title = section1.text.gsub("\n", " ")
        puts section_title

        # A section's text runs to the next section heading. The last
        # section of an article runs to the article's own terminator, which
        # is on the same sibling chain as section1. (The old code used
        # histories.last, looked up inside the article node, so the walk
        # overshot into later articles.)
        section_end = k < sections1.length - 1 ? sections1[k + 1] : article_end
        paragraphs1 = collect_between(section1, section_end)
        section_text1 = inner_section_text(paragraphs1)

        section_title_index, section_title_text = split_section_heading(section_title)
        # Drop the last two characters of the index token.
        section_title_index = section_title_index[0...-2]

        sections_array.push([section_title_index, section_title_text])
        write_section_file(section_title_index, section_text1, section_title_text)
      end
    end

    # Chapters without articles: take the sections directly from the chapter.
    if articles.length == 0
      sections2 = doc_body.css("[class='Section']")
      sections2.each_with_index do |section2, k|
        section_title2 = section2.css('span')[0].text
        puts section_title2

        # Text runs to the next section, or to the footer for the last one.
        # MAYBE TODO (from the original): chapter 29A, whose last paragraph
        # has no history, may need its own case.
        section_end = k < sections2.length - 1 ? sections2[k + 1] : footer
        paragraphs = collect_between(section2, section_end)
        section_text2 = inner_section_text(paragraphs)

        section_title_index, section_title_text = split_section_heading(section_title2)
        puts "then..."
        puts section_title_index
        puts section_title_text
        if i == chapters.length - 2
          # Second-to-last tree node: prefix the index with "A." and trim
          # the catch text's first two characters.
          section_title_text = section_title_text[2..-1]
          section_title_index = "A." + section_title_index[0...-1]
        else
          section_title_index = section_title_index[0...-2]
        end
        puts "now..."
        puts section_title_index
        puts section_title_text

        sections_array.push([section_title_index, section_title_text])
        write_section_file(section_title_index, section_text2, section_title_text)
      end
    end
  end
ensure
  # Always release the browser, even if scraping aborts part-way through.
  browser.close
end
# Dump the collected [identifier, catch text] pairs on their own.
File.write("sids.json", sections_array.to_json)

# Combine the section ids and the chapter titles into a single index object
# and write it out as well.
index_object = { :sections => sections_array, :titles => titles_array }
File.write("index.json", index_object.to_json)
# TODO (extra): handle the first and last chapters. They are not real chapters, so their formatting is irregular.
#celebrate good times | |
#come on |
I am now moving this over to a version that does not write to MySQL but instead parses everything into a format called "The State Decoded," for which some nice open-source browsers are being built in the sf-brigade GitHub account.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Unfortunately, this is not working completely correctly yet. It seems to start at the right places and to create chapters, articles, and sections appropriately. However, although a thorough analysis is almost impossible because there is so much text, there is at least one case where the "section" text contains the section you want but then keeps running on into the text that follows it.
So titles are fine. Looks like just the section text. It must have something to do with the histories and footers...
Even so, this is a relatively minor bug and development of the front-end can continue regardless. Just needs to be fixed before launch.