# Source: GitHub gist gouf/ec880c381d4b606b522e
require 'csv'
require 'kconv'
require 'open-uri'

require 'mechanize'
# Loose pattern for something shaped like an e-mail address:
# word characters or "-", an "@", then a host containing at least one dot.
# The pattern is deliberately unanchored: it is used both to test whether a
# string contains an address and, with String#scan, to pull addresses out of
# longer text such as HTML markup.
# (\w already covers digits and "_", so they are not repeated in the classes.)
MAIL_ADDRESS_REGEXP = /[\w-]+@[\w-]+\.[\w.-]+/.freeze
# Returns the URL prefix that pref_url builds prefecture listing URLs from.
# The host part is a placeholder in this file.
def base_url
  'http://xxxxxxxxxxxxx/'
end
# Fetches +url+ and parses the response as EUC-JP encoded HTML.
#
# @param url [String] address of the page to download
# @return [Nokogiri::HTML::Document] the parsed page
#
# NOTE(review): Nokogiri is not required directly by this file; it is
# presumably loaded through the `mechanize` require — confirm.
def open_page(url)
  # URI.open (from open-uri) is used instead of Kernel#open, which no longer
  # opens URLs on Ruby 3.0+. The block form closes the handle after parsing.
  URI.open(url, 'r:euc-jp') do |io|
    Nokogiri::HTML(io, nil, 'euc-jp')
  end
end
# Builds the listing URL for one prefecture.
#
# @param num [Integer] prefecture id, expected to be in 1..47
# @return [String] base_url followed by the page index and the pref_id query
#
# The 47 prefecture ids are spread over eight page indexes. An id outside
# 1..47 matches no branch, so the page index is nil and interpolates as an
# empty string.
def pref_url(num)
  # Page index that corresponds to each of the 47 prefectures.
  page_index =
    case num
    when 1, 2   then 1
    when 3..8   then 2
    when 9..15  then 3
    when 16..25 then 4
    when 26..31 then 5
    when 32..36 then 6
    when 37..40 then 7
    when 41..47 then 8
    end

  "#{base_url}#{page_index}&pref_id=#{num}"
end
# Follows the "next" link of a listing page.
#
# @param page [#search] a parsed listing page
# @return [Array, Object] an empty Array when the page has no element
#   matching 'span+ a'; otherwise whatever open_page returns for the linked URL
def next_page(page)
  # Run the selector once; the first match is taken as the link to the next
  # batch of results.
  next_link = page.search('span+ a').first
  return [] if next_link.nil?

  open_page("http://www.akabou.ne.jp/#{next_link[:href]}")
end
# Tests whether +mail_address+ contains text shaped like an e-mail address.
#
# @param mail_address [String, nil] text to inspect
# @return [Integer, nil] offset of the first match (truthy, may be 0), or nil
#   when nothing matches or the argument is nil
#
# The pattern is unanchored, so surrounding text does not prevent a match.
def validate_mail_address(mail_address)
  MAIL_ADDRESS_REGEXP =~ mail_address
end
# Collects the fields of one company that are written to the CSV.
#
# @param element [#text] the link to the company on the listing page; its
#   text is used as the first field
# @param page [#search] the parsed company detail page
# @param mail_address [String] text found at 'tr:nth-child(1) a'; decides
#   which of the two selector sets is used
# @return [Array] four values: element text (UTF-8), mail address or nil, and
#   the text of two further cells
#
# NOTE(review): going by the CSV header order in scrape_to_csv, the last two
# values are presumably the representative's name and the location — confirm.
def extract_company_info(element, page, mail_address)
  # Two selector sets are in use; the one with the address in the first table
  # row applies when that row's link text looks like a mail address.
  mail_selector, third_selector, fourth_selector =
    if validate_mail_address(mail_address)
      ['tr:nth-child(1) a',
       '.grid-7 tr:nth-child(2) td',
       '.grid-7 tr:nth-child(3) td']
      # Not collected yet:
      #   '.grid-7 tr:nth-child(4) td'
      #   ':content("ファックス")+ td'
    else
      ['tr:nth-child(3) a',
       'tr:nth-child(4) .standard',
       'tr:nth-child(6) .standard']
      # Not collected yet:
      #   'td td tr:nth-child(8) .standard'
      #   'tr:nth-child(10) .standard'
    end

  # Gather the data that gets saved.
  company_name = element.text.toutf8
  # The second address found in the serialized link markup is used
  # (presumably the first is in the href and the second is the link text).
  # It is nil when the markup holds fewer than two addresses.
  address = page.search(mail_selector).to_s.scan(MAIL_ADDRESS_REGEXP)[1]
  third_field = page.search(third_selector).text
  fourth_field = page.search(fourth_selector).text

  [company_name, address, third_field, fourth_field]
end
# agent = Mechanize.new | |
# Crawls the company listings of all 47 prefectures and writes one row per
# company to 'xxx.csv' in the current directory.
#
# The file starts with a six-column header row. Each data row carries the
# four values returned by extract_company_info, so the tel/fax columns are
# currently left unfilled.
def scrape_to_csv
  header = %w(会社名 メールアドレス 代表者名 所在地 tel fax)
  CSV.open('xxx.csv', 'w', headers: header, write_headers: true) do |csv|
    # All 47 prefectures.
    1.upto(47) { |num| scrape_prefecture(csv, num) }
  end
end

# Writes the rows for every listing page of prefecture +num+ to +csv+.
def scrape_prefecture(csv, num)
  seen_urls = []
  page = open_page(pref_url(num))

  # Each listing page shows up to 30 companies. next_page hands back []
  # (which has no #search) once there is no further page. Previously its
  # result was discarded, so only the first page was ever scraped.
  while page.respond_to?(:search)
    company_links = page.search('.mini2+ .mini2 a')
    urls = company_links.map { |link| link[:href] }
    # Guard against a pager that loops back: stop as soon as a page
    # contributes no company that has not been seen already.
    break if (urls - seen_urls).empty?

    seen_urls.concat(urls)
    company_links.each do |link|
      # sleep(1)
      row = company_row(link)
      csv << row unless row.empty?
    end

    # Move on to the next page.
    page = next_page(page)
  end
end

# Downloads one company's detail page and returns its CSV row.
def company_row(company_link)
  company_page = open_page(company_link[:href])
  # The mail address sits at either tr:nth-child(1) or tr:nth-child(3);
  # the first position is probed here and extract_company_info picks the
  # matching selector set.
  mail_address = company_page.search('tr:nth-child(1) a').text
  extract_company_info(company_link, company_page, mail_address)
end
# (GitHub page footer with sign-up / sign-in prompts; not part of the script.)