Skip to content

Instantly share code, notes, and snippets.

@jdar
Created May 2, 2012 13:09
Show Gist options
  • Save jdar/2576428 to your computer and use it in GitHub Desktop.
Save jdar/2576428 to your computer and use it in GitHub Desktop.
scrape flashcards
# Rakefile
require 'open-uri'
require 'nokogiri'
require 'fileutils'
# Scrape table rows out of an HTML page and write them to flashcard files.
#
# Usage: rake "scrape[page,table_container_id,header_rows]"
#
#   page               - URL, local file path, or glob (contains "*"; the
#                        first match is used). Required.
#   table_container_id - id of the <div> wrapping the table; defaults to
#                        "words-container".
#   header_rows        - number of leading <tr> rows to discard; defaults to 0.
#
# For every row, the stripped text of child nodes 0 and 4 plus the text of
# child node 6 coerced to an integer are joined with "|". Rows are written in
# groups of 20 to vocab/<unix time>_<index of the group's first row>_<word>,
# where <word> is the last whitespace-separated token of child node 4 of the
# group's first row with non-word characters removed.
#
# Raises ArgumentError when page is missing or a glob matches no file.
desc "scrape a url or file for table rows. lightly hard-coded for WOLD"
task :scrape, :page, :table_container_id, :header_rows do |_task, args|
  source = args[:page].to_s
  if source.empty?
    raise ArgumentError, "page is required: rake \"scrape[page,table_container_id,header_rows]\""
  end

  remote = !(source =~ %r{\A(?:https?|ftp)://}i).nil?

  if !remote && source.include?("*")
    pattern = source
    source = Dir[pattern].first
    raise ArgumentError, "no file matches #{pattern}" if source.nil?
  end

  # Only real URLs are handed to open-uri; everything else goes through
  # File.open, so a page argument such as "| cmd" is never executed the way
  # Kernel#open would execute it. The block forms close the handle after parsing.
  document =
    if !remote
      File.open(source) { |io| Nokogiri::HTML(io) }
    elsif URI.respond_to?(:open)
      # Ruby 3.0 removed open-uri's Kernel#open patch; URI.open exists since 2.5.
      URI.open(source) { |io| Nokogiri::HTML(io) }
    else
      open(source) { |io| Nokogiri::HTML(io) }
    end

  container_id = args[:table_container_id] || "words-container"
  header_count = [args[:header_rows].to_i, 0].max # nil.to_i is 0; negatives mean "none"
  rows = document.search("//div[@id='#{container_id}']//tr").to_a.drop(header_count)

  # The child-node positions 0, 4 and 6 are hard-coded for the WOLD table
  # layout. Rows that do not have all three are skipped instead of crashing.
  cell_sets = rows.map do |tr|
    children = tr.children
    [0, 4, 6].map { |index| children[index] }
  end
  complete, malformed = cell_sets.partition { |cells| cells.none?(&:nil?) }
  warn "skipped #{malformed.length} row(s) with fewer than 7 child nodes" unless malformed.empty?

  records = complete.map do |first, word, number|
    [first.text.strip, word.text.strip, number.text.to_i.to_s]
  end

  FileUtils.mkdir_p("vocab")
  timestamp = Time.now.to_i
  bin_size = 20
  records.each_slice(bin_size).with_index do |group, slice_index|
    # to_s guards against an empty word cell, where split(...).last is nil.
    label = group.first[1].split(/\s/).last.to_s.gsub(/\W/, "")
    File.open("vocab/#{timestamp}_#{slice_index * bin_size}_#{label}", "w") do |f|
      group.each { |values| f.puts(values.join("|")) }
    end
  end

  puts "parsed: #{records.length}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment