# gouf/scraping.rb (Secret gist, created September 26, 2014)

require 'kconv'
require 'open-uri'
require 'nokogiri'
require 'mechanize'
require 'csv'

MAIL_ADDRESS_REGEXP = /[\w\d_-]+@[\w\d_-]+\.[\w\d._-]+/.freeze

def base_url
  'http://xxxxxxxxxxxxx/'
end

def open_page(url)
  # The target pages are served as EUC-JP, so fetch and parse with that encoding.
  html = URI.open(url, 'r:euc-jp')
  Nokogiri::HTML(html, nil, 'EUC-JP')
end
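
# Usage sketch (assumes the target host is reachable and responds with EUC-JP HTML):
#   page = open_page(base_url) #=> Nokogiri::HTML::Document
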
def pref_url(num)
  # Page index corresponding to each of the 47 prefectures
  page_index =
    case num
    when 1, 2   then 1
    when 3..8   then 2
    when 9..15  then 3
    when 16..25 then 4
    when 26..31 then 5
    when 32..36 then 6
    when 37..40 then 7
    when 41..47 then 8
    end
  "#{base_url}#{page_index}&pref_id=#{num}"
end
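
# Worked example: num = 13 falls in the 9..15 branch, so page_index is 3 and the
# placeholder base_url is prefixed unchanged:
#   pref_url(13) #=> "http://xxxxxxxxxxxxx/3&pref_id=13"
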
def next_page(page)
  # Return nil when the page has no "next 30 results" link.
  next_link = page.search('span+ a')
  return nil if next_link.empty?

  next_30_url = next_link.first[:href]
  url = "http://www.akabou.ne.jp/#{next_30_url}"
  open_page(url)
end

def validate_mail_address(mail_address)
  MAIL_ADDRESS_REGEXP =~ mail_address
end
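
# Illustrative checks against MAIL_ADDRESS_REGEXP (example addresses only):
#   validate_mail_address('info@example.co.jp') #=> 0   (match offset)
#   validate_mail_address('no address here')    #=> nil
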
def extract_company_info(element, page, mail_address)
  opts = []
  if validate_mail_address(mail_address)
    opts << 'tr:nth-child(1) a'
    opts << '.grid-7 tr:nth-child(2) td'
    opts << '.grid-7 tr:nth-child(3) td'
    # @company.push(company_page.search('.grid-7 tr:nth-child(4) td').text)
    # @company.push(company_page.search(':content("ファックス")+ td').text)
  else
    opts << 'tr:nth-child(3) a'
    opts << 'tr:nth-child(4) .standard'
    opts << 'tr:nth-child(6) .standard'
    # @company.push(company_page.search('td td tr:nth-child(8) .standard').text)
    # @company.push(company_page.search('tr:nth-child(10) .standard').text)
  end

  # Collect the data needed for the CSV row.
  # TODO: give these proper names
  attr1 = element.text.toutf8                                       # company name
  attr2 = page.search(opts.shift).to_s.scan(MAIL_ADDRESS_REGEXP)[1] # mail address
  attr3 = page.search(opts.shift).text                              # representative name
  attr4 = page.search(opts.shift).text                              # address
  # attr5 =
  # attr6 =
  [attr1, attr2, attr3, attr4]
end

# agent = Mechanize.new
def scrape_to_csv
  header = %w(会社名 メールアドレス 代表者名 所在地 tel fax)
  CSV.open('xxx.csv', 'w', headers: header, write_headers: true) do |csv|
    # 47 prefectures
    1.upto(47) do |num|
      page = open_page(pref_url(num))
      # Follow the "next 30 results" links until no page is left
      while page
        # 30 companies per page; take each link to an individual company page
        page_elements = page.search('.mini2+ .mini2 a')
        page_elements.each do |company_search|
          # sleep(1)
          company_url = company_search[:href]
          company_page = open_page(company_url)
          # The mail address sits under tr:nth-child(1) (or tr:nth-child(3))
          mail_address = company_page.search('tr:nth-child(1) a').text
          # representative name, address, tel, fax
          company_info =
            extract_company_info(company_search, company_page, mail_address)
          csv << company_info unless company_info.empty?
        end
        # Move to the next page (nil when there is no further page)
        page = next_page(page)
      end
    end
  end
end
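
# Entry point. The original gist defines scrape_to_csv but never calls it; this
# explicit call is an assumption about how the script is meant to be run.
scrape_to_csv if __FILE__ == $PROGRAM_NAME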