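# Scrape the full journal catalog from booksandjournals.brillonline.com:
# build a { name, url, rss, index } entry for every journal, smoke-test
# each entry, and dump the result to <script_name>_output.json.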
require 'open-uri'
require 'nokogiri'
require 'mechanize'
require 'json'

# name the output file after this script
REPO_NAME = File.basename(__FILE__, '.*')
class String
  def valid_json?
    JSON.parse(self)
    true
  rescue JSON::ParserError
    false
  end
end
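# Turn a scraped [name, url] pair into a journal entry hash. Illustrative
# call, using the sample pair from the comment below:
#   build_json(["Advanced Composite Materials",
#               "http://booksandjournals.brillonline.com/content/15685519"])
#   #=> { "name"  => "Advanced Composite Materials",
#   #     "url"   => "http://booksandjournals.brillonline.com/content/15685519",
#   #     "rss"   => "http://booksandjournals.brillonline.com/rss/content/15685519/latest?fmt=rss",
#   #     "index" => "http://booksandjournals.brillonline.com/content/15685519" }
# URLs that don't match the expected host come back as "ERROR @ <url>" strings.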
def build_json(arr)
  entry = {}
  # running w/ error handling because it's a big-ass project
  begin
    # arr = ["Advanced Composite Materials", "http://booksandjournals.brillonline.com/content/15685519"]
    if arr[1].split('/')[2] == 'booksandjournals.brillonline.com' # standard formatting
      abb = arr[1].split('/')[-1]
      temp = {
        "url"   => arr[1],
        "rss"   => "http://booksandjournals.brillonline.com/rss/content/#{abb}/latest?fmt=rss",
        "index" => arr[1]
      }
    else
      puts "BLEEP! BLOOP! I don't know how to build this entry: #{arr}"
    end
    # temp is a local, not @temp: an unknown URL now fails into the rescue
    # instead of silently reusing the previous journal's data
    entry = { "name" => arr[0], "url" => temp['url'], "rss" => temp['rss'], "index" => temp['index'] }
  rescue
    entry = "ERROR @ #{arr[1]}"
  end
  entry
end
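# Smoke-test a single entry: the url must open, the rss feed must parse
# as XML, and the index page must mention at least one year in 2008..2012.
# Fields already set to 'idk' are skipped; pass v = false to suppress the
# VERIFIED line and only print failures.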
def verify_data(entry, v = true)
  begin ###### Verify url
    open(entry['url']) # open-uri raises on HTTP errors; that is the whole test
  rescue
    puts "ERROR: Expecting '#{entry['url']}' to open via open-uri" unless entry['index'] == 'idk'
  end
  begin ###### Verify rss
    # the feed should come back as well-formed XML
    doc = Nokogiri::XML(Mechanize.new.get(entry['rss']).body)
    raise 'feed did not parse' unless doc.errors.empty?
  rescue
    if entry['rss'] != 'idk'
      puts "ERROR: Expecting '#{entry['rss']}' to parse as an XML feed"
      entry['rss'] = 'idk'
    end
  end
  begin ###### Verify index
    page = Mechanize.new.get(entry['index'])
    # count nodes whose text mentions each year; at least one year should show up
    url_tests = (2008..2012).map { |y| page.search("[text()*='#{y}']").count }
    raise 'no recent years' unless url_tests.any? { |c| c > 0 }
  rescue
    puts "ERROR: Expecting '#{entry['index']}' to contain strings '2008..2012'" unless entry['index'] == 'idk'
  end
  puts "VERIFIED: #{entry}" if v
  entry
end
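# Scrape every catalog listing ('a'..'z' plus 'number') in parallel threads,
# dedupe the collected journal list, build and verify an entry per journal,
# and write everything out as JSON.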
def main()
  journals = []
  url_param = ('a'..'z').to_a << 'number'
  threads = []
  for i in url_param
    threads << Thread.new(i) { |param|
      puts "getting '#{param}' journals"
      # open the first page of every parameter: ['a'..'z','number']
      page = Mechanize.new.get("http://booksandjournals.brillonline.com/content/all/#{param}?perPage=100")
      # p "should find %s journals" % page.search('.publistwrapper').search('p')[0].text().split(' ')[-2]
      # figure out how many result pages exist for this parameter
      # FIXME: ghetto rigged
      iteration_links = page.search('.paginator').search('a')
      iters = []
      for link in iteration_links
        iters << link.text().to_i # non-numeric link text evaluates to 0
      end
      # loop through each result page, e.g. (1..4);
      # default to 1 when there is no paginator at all
      for param_iter in (1..(iters.max || 1))
        page_iteration = Mechanize.new.get("http://booksandjournals.brillonline.com/content/all/#{param}?perPage=100&page=#{param_iter}")
        links = page_iteration.search('.separated-list').search('li').search('h5').search('a')
        for link in links
          journals << [ link.text(), link.attributes['href'].text().split(';')[0] ]
        end
      end
    }
  end
  # join outside the loop so the threads actually run in parallel
  threads.each { |t| t.join }
  # after building the list of journals there will be duplicates:
  # the site's page indexing is buggy
  puts "Initial length of 'journals' var: %s" % journals.length
  journals = journals.uniq
  puts "After journals.uniq: %s" % journals.length
  topics_list = []
  for journal in journals
    name = journal[0]
    # NOTE: each of these pages has a bibtex link
    link = "http://booksandjournals.brillonline.com#{journal[1]}"
    topics_list << [name, link]
  end
  final = []
  for t in topics_list
    journal_entry = verify_data(build_json(t))
    final << journal_entry
  end
  puts "VALID JSON? #{final.to_json.valid_json?}"
  output_file = "#{REPO_NAME}_output.json"
  puts "Writing output to file: #{output_file}"
  # write (not append) and close the handle when the block exits
  File.open(output_file, 'w') { |f| f.write(final.to_json) }
  puts "VERIFYING... All outputs should be quiet"
  for entry in final
    verify_data(entry, false)
  end
end
main if __FILE__ == $0
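
# Example session (script name is hypothetical; output lines come from the
# puts calls above):
#   $ ruby brill_scraper.rb
#   getting 'a' journals
#   ...
#   Writing output to file: brill_scraper_output.json
#   VERIFYING... All outputs should be quiet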