Skip to content

Instantly share code, notes, and snippets.

@davetroy
Created August 29, 2016 03:05
Show Gist options
  • Save davetroy/a3e56682218e6e8cf37c13d9f75a2234 to your computer and use it in GitHub Desktop.
Save davetroy/a3e56682218e6e8cf37c13d9f75a2234 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'mechanize'
require 'google_drive'
# --- Fetch the first listing page -------------------------------------------
agent = Mechanize.new
@base_url = "https://www.baltimorecitibuy.org"
page = agent.get("#{@base_url}/bso/external/publicContracts.sdo")

# The centered "inputs-01" cell is expected to contain "... of N"; N is only
# used for the progress message below (0 is printed when nothing matches).
pager_text = page.css('td[class="inputs-01"][align="center"]').first.text
item_count = pager_text[/ of (\d+)/, 1].to_i
puts "item 1 of #{item_count}"
page_num = 1

# --- Open (or create) the destination spreadsheet ---------------------------
session = GoogleDrive::Session.from_config("config.json")
title = "Baltimore City Purchasing"
spreadsheet = session.spreadsheet_by_title(title) || session.create_spreadsheet(title)

# One worksheet per calendar day: drop any sheet already named after today
# so the run starts from an empty one.
sheet_name = Time.now.strftime("%Y-%m-%d")
stale_sheet = spreadsheet.worksheet_by_title(sheet_name)
stale_sheet.delete if stale_sheet
@worksheet = spreadsheet.add_worksheet(sheet_name)

# --- Write the header row ---------------------------------------------------
headings = page.css('td[class="listHeading"]').map(&:text)
p headings
@worksheet.update_cells(1, 1, [headings])
@worksheet.save
# Fetches a detail page and returns the text of its data cells.
#
# @param url [String, nil] URL of the detail page; nil when the listing row
#   had no link
# @return [Array<String>] stripped text of every td.tableText-01 cell found
#   inside table#resultsSubtable2; empty when +url+ is nil
def extract_po(url)
  # A row without a link has no detail page to fetch: contribute no extra
  # columns instead of handing nil to Mechanize.
  return [] if url.nil?

  page = Mechanize.new.get(url)
  results = page.css('table[id="resultsSubtable2"]')
  # headings = results.css('td[class="listHeading"]').map { |cell| cell.text }
  results.css('td[class="tableText-01"]').map { |cell| cell.text.strip }
end
# Converts one listing page into spreadsheet rows.
#
# Every tr whose class starts with "tableStripe" becomes one row made of its
# td texts followed by the columns returned by extract_po for the first link
# found in that row. A cell containing a link is emitted as a =HYPERLINK
# formula pointing at @base_url + href.
#
# Terminates the process (Kernel#exit) when the page contains no such rows;
# the script's paging loop relies on this to stop.
#
# @param page [#css] parsed listing page
# @return [Array<Array<String>>] one array of cell values per listing row
def extract_rows(page)
  rows = page.css('tr[class^="tableStripe"]').map do |row|
    po_url = nil
    cells = row.css('td').map do |cell|
      text = cell.text.strip
      link = cell.css('a[href]').first
      next text unless link

      url = @base_url + link['href']
      po_url ||= url
      # Inside a spreadsheet string literal a double quote is written as two
      # double quotes; without this a quote in the label breaks the formula.
      "=HYPERLINK(\"#{url.gsub('"', '""')}\", \"#{text.gsub('"', '""')}\")"
    end
    # Rows without any link have no detail page, so they get no extra columns.
    po_info = po_url ? extract_po(po_url) : []
    cells + po_info
  end
  exit if rows.empty?
  rows
end
# Appends the rows extracted from +page+ below the worksheet's current last
# row and saves the worksheet.
#
# @param page [Object] listing page handed to extract_rows
# @return whatever @worksheet.save returns
def record_page(page)
  extracted = extract_rows(page)
  @worksheet.update_cells(@worksheet.num_rows + 1, 1, extracted)
  @worksheet.save
end
# Requests another page of the listing by submitting the current page's
# first form with its navigation fields filled in.
#
# @param page [#forms] the page currently displayed
# @param page_num [Integer] value written to the form's "currentPage" field
# @return the result of submitting the form
def next_page(page, page_num)
  nav_form = page.forms.first
  { 'mode' => "navigation", 'currentPage' => page_num }.each do |field, value|
    nav_form[field] = value
  end
  nav_form.submit
end
# Record the page fetched during setup, then keep requesting and recording
# subsequent pages. Nothing here ends the loop: the script stops when
# extract_rows calls exit on a page without result rows.
record_page(page)
loop do
  page_num += 1
  puts page_num
  page = next_page(page, page_num)
  record_page(page)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment