Created
May 23, 2015 09:58
-
-
Save abeyuya/45c40102434d96eed968 to your computer and use it in GitHub Desktop.
nokogiriでプロキシ経由でスクレイピングしたメモ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'

# Scrapes web pages into Nokogiri documents, either directly or through a
# proxy picked from the list published at PROXY_LIST_URL.
#
# NOTE(review): Nokogiri is referenced here but never required in this file;
# presumably it is loaded elsewhere (e.g. by Bundler) -- confirm.
class ScrapingClient
  # Page whose table rows list proxy hosts and ports.
  PROXY_LIST_URL = "http://lab.magicvox.net/proxy/".freeze

  # Collects the proxies listed on PROXY_LIST_URL.
  #
  # Every <tr> that contains both a ".host" and a ".port" match contributes
  # one entry; rows missing either one are skipped.
  #
  # @return [Array<Hash>] hashes of the form {:host => nodes, :port => nodes},
  #   where each value is the result of the corresponding css lookup
  def get_proxy_host_list
    doc = get_nokogiri_obj(PROXY_LIST_URL)

    doc.css("tr").each_with_object([]) do |row, host_port_list|
      host = row.css(".host")
      port = row.css(".port")
      next if host.empty? || port.empty?

      host_port_list.push({ :host => host, :port => port })
    end
  end

  # Builds the URL of the first proxy returned by #get_proxy_host_list.
  #
  # @return [String] e.g. "http://1.2.3.4:8080"
  # @raise [RuntimeError] when the proxy list is empty
  def create_proxy_url
    proxy = get_proxy_host_list.first
    raise "no proxy host found at #{PROXY_LIST_URL}" unless proxy

    # Strip the cell text so surrounding whitespace cannot corrupt the URL.
    "http://#{proxy[:host].text.strip}:#{proxy[:port].text.strip}"
  end

  # Downloads +url+ through the proxy from #create_proxy_url and parses it.
  #
  # @param url [String] address of the page to fetch
  # @return [Nokogiri::HTML::Document]
  def get_nokogiri_obj_from_proxy(url)
    # open-uri expects [proxy_url, user, password]; the credentials are empty.
    proxy = [create_proxy_url, "", ""]
    fetch_document(url, :proxy_http_basic_authentication => proxy)
  end

  # Downloads +url+ directly and parses it.
  #
  # @param url [String] address of the page to fetch
  # @return [Nokogiri::HTML::Document]
  def get_nokogiri_obj(url)
    fetch_document(url)
  end

  private

  # Shared download-and-parse step for both public fetch methods.
  #
  # Uses URI#open from open-uri rather than Kernel#open: opening URLs through
  # Kernel#open was removed in Ruby 3.0, and going through a parsed URI means
  # a non-URL string is never treated as a file path or pipe command.
  #
  # @param url [String] address of the page to fetch
  # @param options [Hash] options forwarded to open-uri
  # @return [Nokogiri::HTML::Document] parsed with the charset open-uri reported
  def fetch_document(url, options = {})
    charset = nil
    html = URI.parse(url).open(options) do |f|
      charset = f.charset
      f.read
    end
    Nokogiri::HTML.parse(html, nil, charset)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment