@carlcrott
Created April 8, 2012 04:14
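# Scrapes booksandjournals.brillonline.com for every journal listed under
# each index parameter ('a'..'z' plus 'number'), builds an entry with the
# journal's url, rss feed, and index page, verifies that each field
# resolves, and writes the collection as JSON to <script-name>_output.json.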
require 'open-uri'
require 'nokogiri'
require 'mechanize'
require 'json'
REPO_NAME = File.basename(__FILE__, File.extname(__FILE__)) if __FILE__ == $0
class String
  # true if the string parses as JSON, false otherwise
  def valid_json?
    JSON.parse(self)
    true
  rescue JSON::ParserError
    false
  end
end
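# Example:
#   '{"name":"Acta Biologica"}'.valid_json?  #=> true
#   'not json'.valid_json?                   #=> false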
# Build a journal entry hash from a [name, url] pair.
def build_json(arr)
  name, url = arr
  entry = nil
  # wrapped in error handling because this is a big scrape
  begin
    # e.g. arr = ["Advanced Composite Materials", "http://booksandjournals.brillonline.com/content/15685519"]
    if url.split('/')[2] == 'booksandjournals.brillonline.com' # standard formatting
      abb = url.split('/')[-1]
      entry = {
        "name"  => name,
        "url"   => url,
        "rss"   => "http://booksandjournals.brillonline.com/rss/content/#{abb}/latest?fmt=rss",
        "index" => url
      }
    else
      puts "BLEEP! BLOOP! I don't know how to build this entry: #{arr}"
      entry = "ERROR @ #{url}"
    end
  rescue
    entry = "ERROR @ #{url}"
  end
  entry
end
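# Example:
#   build_json(["Advanced Composite Materials",
#               "http://booksandjournals.brillonline.com/content/15685519"])
#   #=> { "name"  => "Advanced Composite Materials",
#   #     "url"   => "http://booksandjournals.brillonline.com/content/15685519",
#   #     "rss"   => "http://booksandjournals.brillonline.com/rss/content/15685519/latest?fmt=rss",
#   #     "index" => "http://booksandjournals.brillonline.com/content/15685519" }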
# Check that each field of an entry actually resolves: the url opens,
# the rss feed fetches, and the index page mentions a recent year.
def verify_data(entry, verbose = true)
  return entry unless entry.is_a?(Hash) # skip "ERROR @ ..." strings from build_json
  begin ###### Verify url: open-uri raises if the page is unreachable
    open(entry['url'])
  rescue
    puts "ERROR: Expecting '#{entry['url']}' to open via open-uri" unless entry['index'] == 'idk'
  end
  begin ###### Verify rss: Mechanize raises if the feed can't be fetched
    Mechanize.new.get(entry['rss'])
  rescue
    if entry['rss'] != 'idk'
      puts "ERROR: Expecting '#{entry['rss']}' to fetch via Mechanize"
      entry['rss'] = 'idk'
    end
  end
  begin ###### Verify index: the page should mention at least one year in 2008..2012
    page = Mechanize.new.get(entry['index'])
    counts = (2008..2012).map { |year| page.search("[text()*='#{year}']").count }
    raise 'no recent years found' unless counts.any? { |c| c > 0 }
  rescue
    puts "ERROR: Expecting '#{entry['index']}' to contain a year in 2008..2012" unless entry['index'] == 'idk'
  end
  puts "VERIFIED: #{entry}" if verbose
  entry
end
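# Note: a field set to 'idk' suppresses its warning, so known-bad entries
# stay quiet on the final verification pass.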
def main
  journals = []
  url_params = ('a'..'z').to_a << 'number'
  threads = []
  # one thread per index parameter: ['a'..'z', 'number']
  for param in url_params
    threads << Thread.new(param) do |p|
      puts "getting '#{p}' journals"
      found = []
      # open the first page of results for this parameter
      page = Mechanize.new.get("http://booksandjournals.brillonline.com/content/all/#{p}?perPage=100")
      # read the paginator links to count the result pages
      iters = page.search('.paginator a').map { |link| link.text.to_i } # non-numeric link text evaluates to 0
      max_page = [iters.max || 1, 1].max # unpaginated results have no numeric links
      # walk every result page, e.g. (1..4)
      for page_num in (1..max_page)
        page_iteration = Mechanize.new.get("http://booksandjournals.brillonline.com/content/all/#{p}?perPage=100&page=#{page_num}")
        links = page_iteration.search('.separated-list li h5 a')
        for link in links
          found << [link.text, link.attributes['href'].text.split(';')[0]]
        end
      end
      found # block value, collected via Thread#value below
    end
  end
  # Thread#value joins each thread and returns its list of [name, href] pairs
  threads.each { |t| journals.concat(t.value) }
  # the site's page indexing is buggy, so the raw list contains duplicates
  puts "Initial length of 'journals' var: %s" % journals.length
  journals = journals.uniq
  puts "After journals.uniq: %s" % journals.length

  topics_list = []
  for journal in journals
    name = journal[0]
    # NOTE: each of these pages has a bibtex link
    link = "http://booksandjournals.brillonline.com#{journal[1]}"
    topics_list << [name, link]
  end
  final = []
  for t in topics_list
    final << verify_data(build_json(t))
  end

  puts "VALID JSON? #{final.to_json.valid_json?}"
  output_file = "#{REPO_NAME}_output.json"
  puts "Writing output to file: #{output_file}"
  # block form closes the file handle when the write finishes
  File.open(output_file, 'a') { |f| f.write(final.to_json) }

  puts "VERIFYING... All outputs should be quiet"
  for entry in final
    verify_data(entry, false)
  end
end
main if __FILE__ == $0