mtsuszycki/html_grep.rb

## html_grep.rb
#!/usr/bin/ruby

# examples
# ./htmlgrep.rb "input#gbv"  "http://www.google.co.uk"
# ./htmlgrep.rb "div#gbv"  "http://www.google.co.uk"
# htmlgrep.rb "[@class='gbm']"  "http://www.google.co.uk"
# ./htmlgrep.rb "[@class='gbmc']/ol/li"  "http://www.google.co.uk"
# ~/htmlgrep.rb '.postblockcat_whitesquare/a' HERE\ -\ Nokia\ Conversations.html |
#   grep -Eo '<a href[^>]+' |
#   sed 's/title=/,/g; s/<a href=//g;' > HERE\ -\ Nokia\ Conversations.csv

require 'rubygems'
require 'hpricot'
require 'open-uri'

# Command line: ARGV[0] is the search expression handed to Hpricot's `/`
# operator; ARGV[1] is an optional URL or local file path. When ARGV[1] is
# absent the HTML is read from standard input.
abort "usage: #{File.basename($PROGRAM_NAME)} SELECTOR [URL_OR_FILE]" unless ARGV[0]

file =
  if ARGV[1].nil?
    STDIN
  elsif ARGV[1] =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
    # Scheme-prefixed argument: fetch it through open-uri. Since Ruby 3.0 the
    # bare Kernel#open no longer handles URLs, so prefer URI.open and fall
    # back to Kernel#open only on old interpreters that lack it.
    URI.respond_to?(:open) ? URI.open(ARGV[1]) : open(ARGV[1])
  else
    # Plain path: File.open never interprets a leading "|" as a command to
    # execute, which Kernel#open would.
    File.open(ARGV[1])
  end

begin
  doc = Hpricot(file)
  puts doc / ARGV[0]
ensure
  # Release the handle we opened ourselves; STDIN is left alone.
  file.close unless file.equal?(STDIN)
end
	#!/usr/bin/ruby

	# examples
	# ./htmlgrep.rb "input#gbv" "http://www.google.co.uk"
	# ./htmlgrep.rb "div#gbv" "http://www.google.co.uk"
	# htmlgrep.rb "[@class='gbm']" "http://www.google.co.uk"
	# ./htmlgrep.rb "[@class='gbmc']/ol/li" "http://www.google.co.uk"
	# ~/htmlgrep.rb '.postblockcat_whitesquare/a' HERE\ -\ Nokia\ Conversations.html |
	#   grep -Eo '<a href[^>]+' |
	#   sed 's/title=/,/g; s/<a href=//g;' > HERE\ -\ Nokia\ Conversations.csv

	require 'rubygems'
	require 'hpricot'
	require 'open-uri'

	# Command line: ARGV[0] is the search expression handed to Hpricot's `/`
	# operator; ARGV[1] is an optional URL or local file path. When ARGV[1] is
	# absent the HTML is read from standard input.
	abort "usage: #{File.basename($PROGRAM_NAME)} SELECTOR [URL_OR_FILE]" unless ARGV[0]

	file =
	  if ARGV[1].nil?
	    STDIN
	  elsif ARGV[1] =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
	    # Scheme-prefixed argument: fetch it through open-uri. Since Ruby 3.0 the
	    # bare Kernel#open no longer handles URLs, so prefer URI.open and fall
	    # back to Kernel#open only on old interpreters that lack it.
	    URI.respond_to?(:open) ? URI.open(ARGV[1]) : open(ARGV[1])
	  else
	    # Plain path: File.open never interprets a leading "|" as a command to
	    # execute, which Kernel#open would.
	    File.open(ARGV[1])
	  end

	begin
	  doc = Hpricot(file)
	  puts doc / ARGV[0]
	ensure
	  # Release the handle we opened ourselves; STDIN is left alone.
	  file.close unless file.equal?(STDIN)
	end