pdkl95/at-scrape.rb

## at-scrape.rb
#!/bin/env ruby
# encoding: utf-8
#
# *** [WARNING]
# *** Be aware that I have not added any throttling, respect
# *** of robots.txt, or anything else to limit this script!
# *** If it gets you banninated, or something due to obvious
# *** bot-like behavior, that's your responsibility!
# *** [/warning]
#
# REQUIREMENTS:
#     dadadodo - http://www.jwz.org/dadadodo/
#     the ruby  gem 'nokogiri' - run "gem install nokogiri" if needed
#
# TO GENERATE DERP, RUN:
#    ruby at-scrape.rb $(ruby at-scrape.rb) | dadadodo -
#
# (yes, that runs the script twice. Without any command line
# arguments, it scrapes the front page for article URLs. When given
# URLs as arguments, it scrapes those pages for the article text.)
#
# NOTES:
#   - You should probably cache the article output, instead of
#     re-scraping the site:
#        ruby at-scrape.rb $(ruby at-scrape.rb) > at_cache.txt
#        cat at_cache.txt | dadadodo -
#
#   - It's probably trivial to modify this for other sites, by
#     simply modifying the three constants below.

# Site root: scrape_mainpage fetches this page and matches link URLs against it.
MAINPAGE               = "http://www.americanthinker.com".freeze
# CSS selector used on the front page to find the anchors to inspect.
MAINPAGE_LINK_SELECTOR = "a.home_title".freeze
# CSS selector used on an article page to find the element(s) to print.
ARTICLE_SELECTOR       = ".article_body".freeze

require 'nokogiri'
require 'open-uri'

# Fetch +url+ and print the text content of every element matching
# ARTICLE_SELECTOR to standard output, one +puts+ per matched element.
#
# @param url [String] address of the article page to scrape
def getpage(url)
  # URI.open rather than Kernel#open: open-uri no longer patches
  # Kernel#open as of Ruby 3.0, and Kernel#open would execute an argument
  # of the form "|cmd" as a subprocess (urls come straight from ARGV).
  # The block form closes the response stream once parsing is done.
  doc = URI.open(url) { |page| Nokogiri::HTML(page) }
  doc.css(ARTICLE_SELECTOR).each do |article|
    puts article.content
  end
end


# Fetch MAINPAGE and classify the href of every anchor matching
# MAINPAGE_LINK_SELECTOR:
#   * article and blog URLs are printed to standard output;
#   * video URLs are reported on standard error as skipped;
#   * anything else (including anchors without an href) is reported on
#     standard error as unknown.
def scrape_mainpage
  # URI.open rather than Kernel#open: open-uri no longer patches
  # Kernel#open as of Ruby 3.0. The block form closes the response stream.
  doc = URI.open(MAINPAGE) { |page| Nokogiri::HTML(page) }

  # Escape the site root so that "." in the host name matches literally
  # instead of acting as a regex wildcard. Built once, outside the loop.
  site     = Regexp.escape(MAINPAGE)
  wanted   = %r{#{site}/(?:articles|blog)/}
  unwanted = %r{#{site}/video/}

  doc.css(MAINPAGE_LINK_SELECTOR).each do |link|
    # Node#[] yields the attribute value as a String (or nil when the
    # attribute is absent), so the regex matches below operate on plain text.
    href = link['href']
    case href
    when wanted
      puts href
    when unwanted
      # skip, probably less useful text
      STDERR.puts "skip: #{href}"
    else
      STDERR.puts "Unknown URL: #{href}"
    end
  end
end

# Entry point: with no command line arguments, list article URLs found on
# the front page; otherwise treat every argument as a URL and print its text.
if ARGV.empty?
  scrape_mainpage
else
  ARGV.each { |url| getpage(url) }
end
	#!/bin/env ruby
	# encoding: utf-8
	#
	# *** [WARNING]
	# *** Be aware that I have not added any throttling, respect
	# *** of robots.txt, or anything else to limit this script!
	# *** If it gets you banninated, or something due to obvious
	# *** bot-like behavior, that's your responsibility!
	# *** [/warning]
	#
	# REQUIREMENTS:
	# dadadodo - http://www.jwz.org/dadadodo/
	# the ruby gem 'nokogiri' - run "gem install nokogiri" if needed
	#
	# TO GENERATE DERP, RUN:
	# ruby at-scrape.rb $(ruby at-scrape.rb) | dadadodo -
	#
	# (yes, that runs the script twice. Without any command line
	# arguments, it scrapes the front page for article URLs. When given
	# URLs as arguments, it scrapes those pages for the article text.)
	#
	# NOTES:
	# - You should probably cache the article output, instead of
	# re-scraping the site:
	# ruby at-scrape.rb $(ruby at-scrape.rb) > at_cache.txt
	# cat at_cache.txt | dadadodo -
	#
	# - It's probably trivial to modify this for other sites, by
	# simply modifying the three constants below.

	# Site root: scrape_mainpage fetches this page and matches link URLs against it.
	MAINPAGE = "http://www.americanthinker.com".freeze
	# CSS selector used on the front page to find the anchors to inspect.
	MAINPAGE_LINK_SELECTOR = "a.home_title".freeze
	# CSS selector used on an article page to find the element(s) to print.
	ARTICLE_SELECTOR = ".article_body".freeze

	require 'nokogiri'
	require 'open-uri'

	# Fetch +url+ and print the text content of every element matching
	# ARTICLE_SELECTOR to standard output, one +puts+ per matched element.
	#
	# @param url [String] address of the article page to scrape
	def getpage(url)
	  # URI.open rather than Kernel#open: open-uri no longer patches
	  # Kernel#open as of Ruby 3.0, and Kernel#open would execute an argument
	  # of the form "|cmd" as a subprocess (urls come straight from ARGV).
	  # The block form closes the response stream once parsing is done.
	  doc = URI.open(url) { |page| Nokogiri::HTML(page) }
	  doc.css(ARTICLE_SELECTOR).each do |article|
	    puts article.content
	  end
	end


	# Fetch MAINPAGE and classify the href of every anchor matching
	# MAINPAGE_LINK_SELECTOR:
	#   * article and blog URLs are printed to standard output;
	#   * video URLs are reported on standard error as skipped;
	#   * anything else (including anchors without an href) is reported on
	#     standard error as unknown.
	def scrape_mainpage
	  # URI.open rather than Kernel#open: open-uri no longer patches
	  # Kernel#open as of Ruby 3.0. The block form closes the response stream.
	  doc = URI.open(MAINPAGE) { |page| Nokogiri::HTML(page) }

	  # Escape the site root so that "." in the host name matches literally
	  # instead of acting as a regex wildcard. Built once, outside the loop.
	  site     = Regexp.escape(MAINPAGE)
	  wanted   = %r{#{site}/(?:articles|blog)/}
	  unwanted = %r{#{site}/video/}

	  doc.css(MAINPAGE_LINK_SELECTOR).each do |link|
	    # Node#[] yields the attribute value as a String (or nil when the
	    # attribute is absent), so the regex matches below operate on plain text.
	    href = link['href']
	    case href
	    when wanted
	      puts href
	    when unwanted
	      # skip, probably less useful text
	      STDERR.puts "skip: #{href}"
	    else
	      STDERR.puts "Unknown URL: #{href}"
	    end
	  end
	end

	# Entry point: with command line arguments, treat each one as a URL and
	# print its article text; with none, list article URLs from the front page.
	if ARGV.length > 0
	  ARGV.each do |url|
	    getpage url
	  end
	else
	  scrape_mainpage
	end