# jin/hwz.rb

## hwz.rb
require 'pp'
require 'timeout'

require 'nokogiri'
require 'httparty'
require 'concurrent'

# Minimal client for the HardwareZone forums: downloads forum and thread pages
# and pulls thread links and post texts out of the returned HTML.
class HWZRequest

  def initialize
    @host = "http://forums.hardwarezone.com.sg"
  end

  # Lists the thread links found on a forum index page.
  #
  # @param path [String] page path relative to the host
  # @return [Array<String>] the href of every element whose id contains
  #   "thread_title_"; matched elements without an href are skipped
  def get_thread_paths(path)
    doc = Nokogiri::HTML(HTTParty.get(@host + '/' + path))
    thread_list_xpath = '//*[contains(@id, "thread_title_")]'
    doc.xpath(thread_list_xpath).map { |element| element['href'] }.compact
  end

  # Collects the post texts of a thread, walking its pages from +page+ up to
  # the last page.
  #
  # @param thread [String] thread path relative to the host
  # @param page [Integer] first page to download
  # @param max_pages [Integer, nil] last page; read from the pagination block
  #   of the first downloaded page when nil
  # @param comments [Array] accumulator the extracted texts are appended to
  # @return [Array<String, nil>] +comments+; an entry is nil when a post has no
  #   extractable text. When a page cannot be downloaded, whatever was
  #   collected up to that point is returned.
  def get_comments(thread, page = 1, max_pages = nil, comments = [])
    loop do
      doc = fetch_document(page_url(thread, page))
      return comments if doc.nil?

      max_pages = max_pages_count(doc) if max_pages.nil?
      puts "SUCCESS: Thread: #{thread} - pulled page #{page}/#{max_pages}"

      comments_xpath = '//*[contains(@id, "post_message_")]'
      comments.concat(doc.xpath(comments_xpath).map { |element| extract_comment(element) })

      # >= rather than ==: a start page beyond max_pages must still terminate.
      break if page >= max_pages

      page += 1
    end

    puts "comment count: #{comments.count}"
    comments
  end

  private

  # Builds the absolute URL of one page of a thread. Page 1 is the thread path
  # itself; later pages insert "-<page>" before the ".html" extension.
  def page_url(thread, page)
    return @host + '/' + thread if page <= 1

    # Strip the extension from the thread path only, never from the host name.
    base = thread.sub(/\.[^.\/]*\z/, '')
    "#{@host}/#{base}-#{page}.html"
  end

  # Downloads and parses one page. Network-level failures (name resolution,
  # refused or reset connections, timeouts) are reported and yield nil so the
  # caller can keep what it has gathered so far.
  def fetch_document(url)
    Nokogiri::HTML(HTTParty.get(url))
  rescue SocketError, SystemCallError, Timeout::Error => e
    puts "Hit with #{e.class} #{e}, skipping.."
    nil
  end

  # Returns the whitespace-normalized text of a post's first child node, or nil
  # when the post has no child nodes or no text is left.
  def extract_comment(element)
    first_node = element.children.first
    return nil if first_node.nil?

    comment = first_node.content.to_s.gsub(/\s+/, " ").strip

    # Remove '$username wrote: { ... }'
    # NOTE: this drops everything up to the first colon of ANY text, not only
    # of quote headers.
    splitted = comment.split(":")
    splitted.count > 1 ? splitted[1..-1].join(":") : splitted.first
  end

  # Reads the total page count from the trailing number of the pagination
  # label. Falls back to 1 when there is no pagination block or when it does
  # not have the expected shape.
  def max_pages_count(doc)
    max_pages_xpath = '//div[@class="pagination"]'
    result = doc.xpath(max_pages_xpath).first
    return 1 if result.nil?

    holder = result.children[1]
    label_node = holder && holder.children.first
    label = label_node ? label_node.content.to_s : ""

    count = label[/(\d+)[^\d]*$/, 1].to_i
    count > 0 ? count : 1
  end

end

# Entry points that scrape forum data through HWZRequest and append the
# results to text files.
class HWZScraper

  # Serializes appends made by concurrent scrape jobs so that lines written by
  # different threads do not interleave. Guarded so that evaluating this class
  # body a second time keeps the lock that is already in use.
  WRITE_LOCK = Mutex.new unless const_defined?(:WRITE_LOCK, false)

  # Fetches the thread links of one forum index page and appends them to
  # 'threads.txt'.
  #
  # @param forum_path [String] forum page path relative to the host
  # @return [Array<String>] the thread links that were found
  def self.scrape_thread_paths(forum_path)
    req = HWZRequest.new
    resp = req.get_thread_paths(forum_path)
    save!('threads.txt', resp)
    resp
  end

  # Fetches the comments of a thread, drops nil entries, strips surrounding
  # whitespace, removes duplicates and appends the rest to 'comment.txt'.
  #
  # @param thread_path [String] thread path relative to the host
  # @return [Array<String>] the comments that were written
  def self.scrape_thread_comments(thread_path)
    req = HWZRequest.new
    resp = req.get_comments(thread_path).compact.map(&:strip).uniq
    save!('comment.txt', resp)
  end

  # Appends every element of +data+ to +filepath+, one per line. The whole
  # batch is written while holding WRITE_LOCK.
  #
  # @param filepath [String] file to append to (created when missing)
  # @param data [Array] values to write
  # @return [Array] +data+
  def self.save!(filepath, data)
    WRITE_LOCK.synchronize do
      File.open(filepath, 'a') do |f|
        data.each { |e| f.puts e }
      end
    end
  end

end

# forums = {
#   edmw: 'eat-drink-man-woman-16'
# }

# threads = [] << HWZScraper.scrape_thread_paths(forums[:edmw])
# 786.upto(905) do |i|
#   HWZScraper.scrape_thread_paths(forums[:edmw] + "/index#{i}.html")
#   p "pulled #{i}"
# end

# Driver: read the thread paths to scrape (one per line), hand each one to a
# fixed pool of 5 worker threads, then wait for every job to finish.
# Surrounding whitespace (including "\r" from CRLF files) is removed and blank
# lines are skipped, since an empty path would request the forum root.
threads = File.readlines('unique_threads.txt').map(&:strip).reject(&:empty?)

pool = Concurrent::FixedThreadPool.new(5)

threads.each_with_index do |t, idx|
  puts "Posting job #{idx} to poll"
  pool.post do
    begin
      HWZScraper.scrape_thread_comments(t)
      puts "SUCCESS: Thread #{idx}/#{threads.count}"
    rescue StandardError => e
      # The pool does not surface exceptions raised inside posted blocks, so
      # report the failure here instead of losing it.
      puts "FAILED: Thread #{idx}/#{threads.count} (#{t}) - #{e.class}: #{e.message}"
    end
  end
end

pool.shutdown
pool.wait_for_termination
	require 'nokogiri'
	require 'httparty'
	require 'concurrent'
	require 'pp'

	# Minimal client for the HardwareZone forums: downloads forum and thread pages
	# and pulls thread links and post texts out of the returned HTML.
	class HWZRequest

	  def initialize
	    @host = "http://forums.hardwarezone.com.sg"
	  end

	  # Lists the thread links found on a forum index page.
	  #
	  # @param path [String] page path relative to the host
	  # @return [Array<String>] the href of every element whose id contains
	  #   "thread_title_"; matched elements without an href are skipped
	  def get_thread_paths(path)
	    doc = Nokogiri::HTML(HTTParty.get(@host + '/' + path))
	    thread_list_xpath = '//*[contains(@id, "thread_title_")]'
	    doc.xpath(thread_list_xpath).map { |element| element['href'] }.compact
	  end

	  # Collects the post texts of a thread, walking its pages from +page+ up to
	  # the last page.
	  #
	  # @param thread [String] thread path relative to the host
	  # @param page [Integer] first page to download
	  # @param max_pages [Integer, nil] last page; read from the pagination block
	  #   of the first downloaded page when nil
	  # @param comments [Array] accumulator the extracted texts are appended to
	  # @return [Array<String, nil>] +comments+; an entry is nil when a post has no
	  #   extractable text. When a page cannot be downloaded, whatever was
	  #   collected up to that point is returned.
	  def get_comments(thread, page = 1, max_pages = nil, comments = [])
	    loop do
	      doc = fetch_document(page_url(thread, page))
	      return comments if doc.nil?

	      max_pages = max_pages_count(doc) if max_pages.nil?
	      puts "SUCCESS: Thread: #{thread} - pulled page #{page}/#{max_pages}"

	      comments_xpath = '//*[contains(@id, "post_message_")]'
	      comments.concat(doc.xpath(comments_xpath).map { |element| extract_comment(element) })

	      # >= rather than ==: a start page beyond max_pages must still terminate.
	      break if page >= max_pages

	      page += 1
	    end

	    puts "comment count: #{comments.count}"
	    comments
	  end

	  private

	  # Builds the absolute URL of one page of a thread. Page 1 is the thread path
	  # itself; later pages insert "-<page>" before the ".html" extension.
	  def page_url(thread, page)
	    return @host + '/' + thread if page <= 1

	    # Strip the extension from the thread path only, never from the host name.
	    base = thread.sub(/\.[^.\/]*\z/, '')
	    "#{@host}/#{base}-#{page}.html"
	  end

	  # Downloads and parses one page. Network-level failures (name resolution,
	  # refused or reset connections, timeouts) are reported and yield nil so the
	  # caller can keep what it has gathered so far.
	  def fetch_document(url)
	    Nokogiri::HTML(HTTParty.get(url))
	  rescue SocketError, SystemCallError, Timeout::Error => e
	    puts "Hit with #{e.class} #{e}, skipping.."
	    nil
	  end

	  # Returns the whitespace-normalized text of a post's first child node, or nil
	  # when the post has no child nodes or no text is left.
	  def extract_comment(element)
	    first_node = element.children.first
	    return nil if first_node.nil?

	    comment = first_node.content.to_s.gsub(/\s+/, " ").strip

	    # Remove '$username wrote: { ... }'
	    # NOTE: this drops everything up to the first colon of ANY text, not only
	    # of quote headers.
	    splitted = comment.split(":")
	    splitted.count > 1 ? splitted[1..-1].join(":") : splitted.first
	  end

	  # Reads the total page count from the trailing number of the pagination
	  # label. Falls back to 1 when there is no pagination block or when it does
	  # not have the expected shape.
	  def max_pages_count(doc)
	    max_pages_xpath = '//div[@class="pagination"]'
	    result = doc.xpath(max_pages_xpath).first
	    return 1 if result.nil?

	    holder = result.children[1]
	    label_node = holder && holder.children.first
	    label = label_node ? label_node.content.to_s : ""

	    count = label[/(\d+)[^\d]*$/, 1].to_i
	    count > 0 ? count : 1
	  end

	end

	# Entry points that scrape forum data through HWZRequest and append the
	# results to text files.
	class HWZScraper

	  # Serializes appends made by concurrent scrape jobs so that lines written by
	  # different threads do not interleave. Guarded so that evaluating this class
	  # body a second time keeps the lock that is already in use.
	  WRITE_LOCK = Mutex.new unless const_defined?(:WRITE_LOCK, false)

	  # Fetches the thread links of one forum index page and appends them to
	  # 'threads.txt'.
	  #
	  # @param forum_path [String] forum page path relative to the host
	  # @return [Array<String>] the thread links that were found
	  def self.scrape_thread_paths(forum_path)
	    req = HWZRequest.new
	    resp = req.get_thread_paths(forum_path)
	    save!('threads.txt', resp)
	    resp
	  end

	  # Fetches the comments of a thread, drops nil entries, strips surrounding
	  # whitespace, removes duplicates and appends the rest to 'comment.txt'.
	  #
	  # @param thread_path [String] thread path relative to the host
	  # @return [Array<String>] the comments that were written
	  def self.scrape_thread_comments(thread_path)
	    req = HWZRequest.new
	    resp = req.get_comments(thread_path).compact.map(&:strip).uniq
	    save!('comment.txt', resp)
	  end

	  # Appends every element of +data+ to +filepath+, one per line. The whole
	  # batch is written while holding WRITE_LOCK.
	  #
	  # @param filepath [String] file to append to (created when missing)
	  # @param data [Array] values to write
	  # @return [Array] +data+
	  def self.save!(filepath, data)
	    WRITE_LOCK.synchronize do
	      File.open(filepath, 'a') do |f|
	        data.each { |e| f.puts e }
	      end
	    end
	  end

	end

	# forums = {
	# edmw: 'eat-drink-man-woman-16'
	# }

	# threads = [] << HWZScraper.scrape_thread_paths(forums[:edmw])
	# 786.upto(905) do |i|
	# HWZScraper.scrape_thread_paths(forums[:edmw] + "/index#{i}.html")
	# p "pulled #{i}"
	# end

	# Driver: read the thread paths to scrape (one per line), hand each one to a
	# fixed pool of 5 worker threads, then wait for every job to finish.
	# Surrounding whitespace (including "\r" from CRLF files) is removed and blank
	# lines are skipped, since an empty path would request the forum root.
	threads = File.readlines('unique_threads.txt').map(&:strip).reject(&:empty?)

	pool = Concurrent::FixedThreadPool.new(5)

	threads.each_with_index do |t, idx|
	  puts "Posting job #{idx} to poll"
	  pool.post do
	    begin
	      HWZScraper.scrape_thread_comments(t)
	      puts "SUCCESS: Thread #{idx}/#{threads.count}"
	    rescue StandardError => e
	      # The pool does not surface exceptions raised inside posted blocks, so
	      # report the failure here instead of losing it.
	      puts "FAILED: Thread #{idx}/#{threads.count} (#{t}) - #{e.class}: #{e.message}"
	    end
	  end
	end

	pool.shutdown
	pool.wait_for_termination