Created
November 12, 2011 15:38
-
-
Save gertig/1360703 to your computer and use it in GitHub Desktop.
Grab top stories from Hacker News (supports pagination)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'uri'
# Reports whether +str+ is too short to be a real story title.
#
# Despite the name, no numeric check is performed: any value whose string
# form is five characters or fewer counts as "short". The scraper uses this
# to skip anchors whose text is not a real title (e.g. the "More" link).
#
# @param str [#to_s] the anchor text to inspect; nil is treated as ""
# @return [Boolean] true when the text has at most 5 characters
def is_numeric_or_short(str)
  # to_s makes nil (and other non-String values) safe instead of raising.
  str.to_s.length <= 5
end
# Scrapes Hacker News story listings, following the "More" link from page to
# page, and writes one line per story slot to stories.txt in the form
#   <index>. Point_Count||Title||Link||Comment_URL
base_url = "http://news.ycombinator.com"
url = base_url
# URI.open is the open-uri entry point; Kernel#open no longer accepts URLs
# on Ruby 3.0+.
doc = Nokogiri::HTML(URI.open(url))
puts doc.at_css("title").text

next_page = ""
max_pagination = 50
delay_time = 0.5 # seconds to wait between each page fetch
stories = [] # Point_Count||Title||Link||Comment_URL

# Appends +fragment+ to the record stored at +index+, creating it if absent.
append_to_story = lambda do |index, fragment|
  if stories[index]
    stories[index] << fragment
  else
    stories[index] = fragment
  end
end

max_pagination.times do |page|
  # Page 0 was already fetched above; every later page must be fetched here.
  # (The previous `page > 1` check re-parsed the first page on iteration 1.)
  if page > 0
    # Stop when the previous page offered no "More" link, rather than
    # re-fetching the same page over and over.
    break if next_page.nil? || next_page.empty?

    # URI.join copes with hrefs both with and without a leading slash.
    url = URI.join("#{base_url}/", next_page).to_s
    doc = Nokogiri::HTML(URI.open(url))
    next_page = ""
  end

  # trying not to look like a crawler
  sleep(delay_time)

  # Records are stored 1-based, and 30 slots are reserved per page.
  first_index = 1 + (30 * page)

  # grab point_count
  i = first_index
  doc.css("td .subtext span").each do |item|
    # Spans without an id attribute yield nil; to_s avoids a NoMethodError.
    next unless item[:id].to_s.include?("score_")

    # Handles both "1 point" and "42 points". Non-bang strip is used because
    # strip! returns nil when there is nothing to strip.
    point_count = item.text.gsub(/points?/, "").strip
    append_to_story.call(i, "#{point_count}||")
    i += 1
  end

  # grab titles + link
  i = first_index
  doc.css("td .title a").each do |item|
    text = item.text
    next_page = item[:href] if text == "More"
    # Short anchors (such as "More") are not stories, but they still consume
    # a slot, exactly as before.
    append_to_story.call(i, "#{text}||#{item[:href]}") unless is_numeric_or_short(text)
    i += 1
  end

  # grab comment link
  i = first_index
  doc.css("td .subtext a").each do |item|
    comment_url = item[:href]
    # Anchors without an href yield nil; to_s avoids a NoMethodError.
    next unless comment_url.to_s.include?("item?id=")

    # pg says comments are sometimes not included to optimize for speed; so the comment count is not recorded
    append_to_story.call(i, "||#{comment_url}")
    i += 1
  end

  # status
  puts "Page #{page} complete..."
end

# save to file
# The block form closes the file even if writing raises. Slot 0 is never
# filled, so the first line is written as "0. " (unchanged output format).
File.open("stories.txt", "w") do |file|
  stories.each_with_index do |item, index|
    file.puts "#{index}. #{item}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment