Skip to content

Instantly share code, notes, and snippets.

@gertig
Created November 12, 2011 15:38
Show Gist options
  • Save gertig/1360703 to your computer and use it in GitHub Desktop.
Save gertig/1360703 to your computer and use it in GitHub Desktop.
Grab top stories from Hacker News (supports pagination)
require 'rubygems'
require 'nokogiri'
require 'open-uri'
# Reports whether +str+ is short enough to be treated as a non-title cell
# (for example a rank number or the "More" link).
#
# Note: despite the name, no numeric check is performed; only the length
# of the string is tested.
#
# @param str [String] text to measure
# @return [Boolean] true when +str+ has five characters or fewer
def is_numeric_or_short(str)
  max_length = 5
  str.length <= max_length
end
# Fetch the front page up front; the pagination loop below starts from this
# document instead of fetching again for its first iteration.
url = "http://news.ycombinator.com"
# URI.open is the open-uri entry point; Kernel#open no longer accepts URLs
# on Ruby 3.0 and later.
doc = Nokogiri::HTML(URI.open(url))
puts doc.at_css("title").text
next_page = "" # href of the "More" link, filled in while scanning each page
max_pagination = 50 # upper bound on the number of pages to walk
delay_time = 0.5 # seconds to wait between each page fetch
stories = [] # Point_Count||Title||Link||Comment_URL
# Appends +text+ to the story record at +index+, creating the record when the
# slot is still empty. Shared by the three scanning passes below.
append_field = lambda do |index, text|
  if stories[index]
    stories[index] << text
  else
    stories[index] = text
  end
end

# Walk up to max_pagination listing pages, merging each story's fields into
# stories[i] as "Point_Count||Title||Link||Comment_URL".
max_pagination.times do |page|
  # Page 0 is the front page that was fetched before the loop. Every later
  # page (including page 1) has to be fetched through the "More" link that
  # was captured on the previous page.
  if page > 0
    # URI.join resolves both absolute-path and relative "More" hrefs.
    url = URI.join("http://news.ycombinator.com", next_page).to_s
    doc = Nokogiri::HTML(URI.open(url))
  end
  # Forget the previous link so a page without "More" ends the crawl instead
  # of re-fetching the same page over and over.
  next_page = ""
  # trying not to look like a crawler
  sleep(delay_time)

  # grab point_count
  i = 1 + (30 * page)
  doc.css("td .subtext span").each do |item|
    # Spans without an id attribute return nil here; to_s keeps them harmless.
    next unless item[:id].to_s.include?("score_")
    # Non-bang strip: strip! returns nil when there is nothing to trim, which
    # used to blank out the score. "points?" also covers the singular "1 point".
    point_count = item.text.gsub(/points?/, "").strip
    append_field.call(i, "#{point_count}||")
    i += 1
  end

  # grab titles + link
  i = 1 + (30 * page)
  doc.css("td .title a").each do |item|
    text = item.text
    next_page = item[:href].to_s if text == "More"
    # Short anchors (rank numbers, "More") are not story titles; they still
    # consume an index so the numbering matches the original behaviour.
    append_field.call(i, "#{text}||#{item[:href]}") unless is_numeric_or_short(text)
    i += 1
  end

  # grab comment_count + link
  i = 1 + (30 * page)
  doc.css("td .subtext a").each do |item|
    comment_url = item[:href].to_s
    next unless comment_url.include?("item?id=")
    # pg says comments are sometimes not included to optimize for speed; so i'm removing comment_count
    # comment_count = (item.text.include?"discuss") ? "0" : item.text.gsub("comments", "").gsub("comment", "").gsub(" ", "")
    append_field.call(i, "||#{comment_url}")
    i += 1
  end

  # status
  puts "Page #{page} complete..."
  # No "More" link on this page means there is nothing further to crawl.
  break if next_page.empty?
end
# save to file
# Writes one "<index>. <record>" line per slot of stories into stories.txt.
# The block form of File.open closes the handle even if a write raises.
# Slots that were never filled (index 0 is never assigned by the crawl loop)
# are written as an index followed by an empty record, as before.
File.open("stories.txt", "w") do |file|
  stories.each_with_index do |item, i|
    file.puts "#{i}. #{item}"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment