Skip to content

Instantly share code, notes, and snippets.

@mufid
Created July 20, 2015 02:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mufid/c1763e6847ace84c94bc to your computer and use it in GitHub Desktop.
Save mufid/c1763e6847ace84c94bc to your computer and use it in GitHub Desktop.
g_rbt_asia_fetcher.rb
# Fetcher for the /g/ board archive served from rbt.asia.
#
# #fetch_page downloads one index page, collects the thread ids listed on it
# and downloads every thread concurrently. #fetch_thread downloads a single
# thread and stores it through the PostThread and Post models (presumably
# ActiveRecord models defined elsewhere in the project -- confirm).
class GRbtAsia < Fetcher
  BASE_URL = "https://rbt.asia/g".freeze
  PAGE_TEMPLATE = "https://rbt.asia/g/g?task=page&page=%{page}".freeze
  THREAD_TEMPLATE = "https://rbt.asia/g/thread/%{thread}".freeze

  # How many times a thread download is attempted before giving up when the
  # server keeps answering with an internal server error.
  MAX_FETCH_ATTEMPTS = 3

  # Downloads index page +page_num+ and then every thread found on it.
  #
  # One Ruby Thread is started per discovered thread id and all of them are
  # joined before the summary line is printed.
  #
  # @param page_num [Integer, String] page number substituted into PAGE_TEMPLATE
  # @return [nil]
  def fetch_page(page_num)
    puts "Start fetch page #{page_num}"
    html = RestClient.get(PAGE_TEMPLATE % { page: page_num })
    doc = Nokogiri::HTML(html)
    puts "End fetch page #{page_num}. Prepping fetch thread..."

    previous = PostThread.count
    previous_post = Post.count

    # The selector matches every div whose id starts with "p"; elements whose
    # id carries no digits cannot be a thread, so they are dropped here
    # instead of crashing the whole page fetch.
    thread_ids = doc.css("div[id^=p]").map { |node| extract_numeric_id(node) }.compact

    # TODO: use a bounded thread pool instead of one thread per id. Later the
    # parallelisation should move to the manager and this method should only
    # return the fetch status along with the thread ids.
    workers = thread_ids.map do |thread_id|
      Thread.new(thread_id) { |t_id| fetch_thread_with_retry(t_id) }
    end
    workers.each(&:join)

    puts "Fetched new #{PostThread.count - previous} threads, #{Post.count - previous_post} posts"
  end

  # Downloads one thread page and stores the thread together with its posts.
  #
  # Entries for which #post_process returns nil are skipped, for the opening
  # post as well as for the replies.
  #
  # @param thread_id [String, Integer] id substituted into THREAD_TEMPLATE
  # @return [Object] the result of +thread.save+, or false when the page
  #   contains no opening post element
  def fetch_thread(thread_id)
    html = RestClient.get(THREAD_TEMPLATE % { thread: thread_id })
    doc = Nokogiri::HTML(html)

    op_el = doc.at_css("div[id^=p]")
    if op_el.nil?
      puts " --- WARNING: Ignoring thread #{thread_id} because no opening post was found"
      return false
    end

    thread = PostThread.where(id: thread_id.to_i).first_or_initialize
    title_el = op_el.at_css("span.filetitle")
    thread.subject = title_el.content unless title_el.nil?

    # Existing posts are intentionally kept (no thread.posts.delete_all).
    # The opening post goes first, followed by the replies in document order.
    post_elements = [op_el] + doc.css("td[id^=p]").to_a
    post_elements.each do |post_el|
      post = post_process post_el
      thread.posts << post unless post.nil?
    end

    thread.save
  end

  # Builds (or loads) the Post described by +element+ and fills its fields.
  #
  # @param element [Nokogiri::XML::Node] post container whose id holds the
  #   numeric post id
  # @return [Post, nil] the populated, unsaved post; nil when the element has
  #   no numeric id or is marked as "[INTERNAL]" content
  def post_process(element)
    raw_id = extract_numeric_id(element)
    if raw_id.nil?
      puts " --- WARNING: Ignoring entry without a numeric id"
      return nil
    end
    id = raw_id.to_i

    # Only the first inline image is inspected; a missing alt attribute
    # simply means the entry is not marked.
    inline = element.at_css("img.inline")
    if inline && inline["alt"] == "[INTERNAL]"
      puts " --- WARNING: Ignoring entry at #{id} because of modified, non-original content"
      return nil
    end

    # Consider a single query instead of two here for performance.
    post = Post.where(id: id).first_or_initialize

    file_info = element.css("> span").first
    if file_info.nil?
      post.media_id = nil
      post.media_filename = nil
      post.media_original_url = nil
    else
      thumb = element.at_css("img.thumb")
      filename_match = file_info.content.match(/File: .+, .+, (.+)$/)
      post.media_filename = filename_match && filename_match[1]
      post.media_original_url = thumb && thumb["src"]
      # NOTE(review): [0] is the whole match (from the first "/" to the end of
      # the line, slash included); the capture group is unused. Kept as-is
      # because stored media ids may rely on it -- confirm the intent.
      url = post.media_original_url
      url_match = url && url.match(/\/(.+)$/)
      post.media_id = url_match && url_match[0]
    end

    post.content = FetchUtil.sanitize element.css("blockquote")
    name_el = element.at_css("span.postername")
    post.name = name_el && name_el.content.strip
    post.timestamp = FetchUtil.to_date element, "span.posttime", "title"
    post
  end

  private

  # Returns the first run of digits in the element's id attribute, or nil when
  # the attribute is missing or contains no digits.
  def extract_numeric_id(element)
    element["id"].to_s[/\d+/]
  end

  # Runs #fetch_thread for +thread_id+, retrying on
  # RestClient::InternalServerError until MAX_FETCH_ATTEMPTS attempts have
  # failed. Any other exception propagates to the caller.
  def fetch_thread_with_retry(thread_id)
    puts "Start fetch #{thread_id}"
    attempts_left = MAX_FETCH_ATTEMPTS
    begin
      fetch_thread thread_id
    rescue RestClient::InternalServerError
      puts "Error happened when try to fetch #{thread_id}. Give up in #{attempts_left} more time"
      attempts_left -= 1
      retry if attempts_left > 0
      puts "Giving up fetch #{thread_id}!"
    end
    puts "End fetch #{thread_id}"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment