Skip to content

Instantly share code, notes, and snippets.

@jin
Created May 24, 2015 13:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jin/4288b140b1e0d0fb6191 to your computer and use it in GitHub Desktop.
Save jin/4288b140b1e0d0fb6191 to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'httparty'
require 'concurrent'
require 'pp'
# Small scraping client for the HardwareZone forums. It downloads forum index
# pages and thread pages with HTTParty, parses them with Nokogiri and pulls
# thread links and post texts out of the markup via XPath.
class HWZRequest
  # Elements whose id contains "thread_title_" (links on a forum index page).
  THREAD_LINKS_XPATH = '//*[contains(@id, "thread_title_")]'.freeze
  # Elements whose id contains "post_message_" (one per post on a thread page).
  COMMENTS_XPATH = '//*[contains(@id, "post_message_")]'.freeze
  # Pagination container; absent on single-page threads.
  PAGINATION_XPATH = '//div[@class="pagination"]'.freeze
  # Leading "<username> wrote:" attribution. The prefix may not contain a
  # colon itself, so ordinary text such as "Meet at 10:30" is left untouched.
  QUOTE_PREFIX = /\A[^:]*\bwrote\s*:/

  def initialize
    @host = "http://forums.hardwarezone.com.sg"
  end

  # Fetches a forum index page and returns the href of every thread link.
  #
  # @param path [String] forum path relative to the host
  # @return [Array<String>] href values of the matched elements; matched
  #   elements that carry no href attribute are skipped
  def get_thread_paths(path)
    doc = Nokogiri::HTML(HTTParty.get(url_for(path)))
    doc.xpath(THREAD_LINKS_XPATH).map { |element| element['href'] }.compact
  end

  # Collects the cleaned comment texts of a thread, walking its pages from
  # +page+ up to +max_pages+.
  #
  # Pages are fetched in a loop (not recursively), and the loop stops as soon
  # as +page+ reaches or exceeds +max_pages+, so a start page past the end
  # cannot loop forever. On a SocketError the comments gathered so far are
  # returned.
  #
  # @param thread [String] thread path relative to the host
  # @param page [Integer] first page to fetch
  # @param max_pages [Integer, nil] last page; read from the first fetched
  #   page's pagination when nil
  # @param comments [Array<String>] accumulator the results are appended to
  # @return [Array<String>] the accumulator, with empty comments left out
  def get_comments(thread, page = 1, max_pages = nil, comments = [])
    loop do
      begin
        doc = Nokogiri::HTML(HTTParty.get(page_url(thread, page)))
      rescue SocketError => e
        puts "Hit with SocketError #{e}, skipping.."
        return comments
      end

      max_pages = max_pages_count(doc) if max_pages.nil?
      puts "SUCCESS: Thread: #{thread} - pulled page #{page}/#{max_pages}"
      comments.concat(extract_comments(doc))

      break if page >= max_pages
      page += 1
    end

    puts "comment count: #{comments.count}"
    comments
  end

  private

  # Joins the host and a relative path with exactly one slash between them.
  def url_for(path)
    "#{@host}/#{path.sub(%r{\A/+}, '')}"
  end

  # URL of a given page of a thread: page 1 is the thread URL itself, later
  # pages replace the trailing file extension with "-<page>.html".
  def page_url(thread, page)
    url = url_for(thread)
    return url unless page > 1

    url.sub(%r{\.[^./]*\z}, '') + "-#{page}.html"
  end

  # Cleaned texts of all posts on a page, without the empty ones.
  def extract_comments(doc)
    doc.xpath(COMMENTS_XPATH).map { |post| clean_comment(post) }.compact
  end

  # Text of the first child node of a post with whitespace runs collapsed and
  # a leading "<username> wrote:" attribution removed. Returns nil when the
  # post has no child nodes or nothing but whitespace is left.
  def clean_comment(post)
    first_node = post.children.first
    return nil if first_node.nil?

    text = first_node.content.gsub(/\s+/, ' ').strip
    text = text.sub(QUOTE_PREFIX, '').strip
    text.empty? ? nil : text
  end

  # Reads the page count from the pagination block: the last number in the
  # text of the first child of the block's second child node.
  # Falls back to 1 when the block, that node, or the number is missing.
  def max_pages_count(doc)
    pagination = doc.xpath(PAGINATION_XPATH).first
    return 1 if pagination.nil?

    label = pagination.children[1]
    label = label.children.first unless label.nil?
    return 1 if label.nil?

    count = label.content[/(\d+)[^\d]*$/, 1].to_i
    count < 1 ? 1 : count
  end
end
# Convenience layer over HWZRequest: runs a scrape and appends the results to
# a text file, one entry per line.
class HWZScraper
  # Guards file appends. scrape_thread_comments is invoked from several pool
  # threads at once and they all append to the same file; without the lock,
  # buffered writes from different threads can be interleaved mid-line.
  WRITE_LOCK = Mutex.new

  # Scrapes the thread links of a forum index page and appends them to
  # 'threads.txt'.
  #
  # @param forum_path [String] forum path handed to HWZRequest#get_thread_paths
  # @return [Array<String>] the scraped thread paths
  def self.scrape_thread_paths(forum_path)
    paths = HWZRequest.new.get_thread_paths(forum_path)
    save!('threads.txt', paths)
    paths
  end

  # Scrapes all comments of a thread, drops nils, strips and de-duplicates
  # them, and appends the result to 'comment.txt'.
  #
  # @param thread_path [String] thread path handed to HWZRequest#get_comments
  # @return [Array<String>] the comments that were written
  def self.scrape_thread_comments(thread_path)
    comments = HWZRequest.new.get_comments(thread_path).compact.map(&:strip).uniq
    save!('comment.txt', comments)
  end

  # Appends every element of +data+ to +filepath+ on its own line. The whole
  # batch is written while holding WRITE_LOCK, so batches from concurrent
  # callers never mix.
  #
  # @param filepath [String] file to append to (created when missing)
  # @param data [Array] entries to write
  # @return [Array] +data+ itself
  def self.save!(filepath, data)
    WRITE_LOCK.synchronize do
      File.open(filepath, 'a') do |f|
        data.each { |e| f.puts e }
      end
    end
  end
end
# forums = {
# edmw: 'eat-drink-man-woman-16'
# }
# threads = [] << HWZScraper.scrape_thread_paths(forums[:edmw])
# 786.upto(905) do |i|
# HWZScraper.scrape_thread_paths(forums[:edmw] + "/index#{i}.html")
# p "pulled #{i}"
# end
# Driver: reads one thread path per line from 'unique_threads.txt' and scrapes
# the comments of each thread on a pool of 5 worker threads.
# Lines are stripped and blank ones skipped so stray whitespace or "\r" never
# ends up in a request URL.
threads = File.readlines('unique_threads.txt').map(&:strip).reject(&:empty?)
pool = Concurrent::FixedThreadPool.new(5)

threads.each_with_index do |t, idx|
  puts "Posting job #{idx} to poll"
  pool.post do
    begin
      HWZScraper.scrape_thread_comments(t)
      puts "SUCCESS: Thread #{idx}/#{threads.count}"
    rescue StandardError => e
      # The pool does not surface exceptions raised inside posted jobs, so a
      # failed thread would otherwise disappear without a trace.
      warn "FAILED: Thread #{idx}/#{threads.count} (#{t}): #{e.class}: #{e.message}"
    end
  end
end

# Stop accepting jobs, then block until every queued job has finished.
pool.shutdown
pool.wait_for_termination
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment