# gouf/scraping.rb Secret

## scraping.rb
require 'kconv'
require 'open-uri'
require 'mechanize'
require 'csv'

# Loose pattern for pulling an e-mail address out of surrounding text/markup.
# The local part accepts word characters plus '.', '+' and '-' so that dotted
# or tagged addresses (first.last@..., user+tag@...) are captured whole instead
# of being cut at the last dot. The domain part requires at least one dot.
MAIL_ADDRESS_REGEXP = /[\w.+-]+@[\w-]+\.[\w.-]+/.freeze

# Root URL that pref_url prefixes to its listing path.
# NOTE(review): the host looks like a masked placeholder — confirm the real value.
#
# @return [String]
def base_url
  'http://xxxxxxxxxxxxx/'
end

# Fetches +url+ and parses the response as an EUC-JP encoded HTML document.
#
# Uses URI.open (from open-uri) because Kernel#open no longer opens URLs on
# Ruby 3.0+, and the block form so the underlying handle is closed once the
# document has been parsed.
#
# @param url [String] address of the page to fetch
# @return [Object] whatever Nokogiri::HTML returns for the fetched markup
def open_page(url)
  URI.open(url, 'r:euc-jp') do |io|
    Nokogiri::HTML(io, nil, 'euc-jp')
  end
end

# Builds the listing URL for one prefecture.
#
# The prefecture number is mapped onto one of eight page indexes; a number
# that matches no branch leaves the index empty in the resulting URL.
#
# @param num [Integer] prefecture number (the caller iterates 1..47)
# @return [String] "<base_url><page index>&pref_id=<num>"
def pref_url(num)
  # Page index that corresponds to each of the 47 prefectures.
  listing_index =
    case num
    when 1, 2   then 1
    when 3..8   then 2
    when 9..15  then 3
    when 16..25 then 4
    when 26..31 then 5
    when 32..36 then 6
    when 37..40 then 7
    when 41..47 then 8
    end

  "#{base_url}#{listing_index}&pref_id=#{num}"
end

# Opens the page linked from the first 'span+ a' anchor of +page+.
#
# NOTE(review): the host is hard-coded here instead of coming from base_url —
# confirm both refer to the same site.
#
# @param page [#search] parsed listing page
# @return [Array, Object] an empty Array when no 'span+ a' anchor exists,
#   otherwise the result of open_page for the linked URL
def next_page(page)
  following_links = page.search('span+ a')
  return [] if following_links.empty?

  relative_href = following_links.first[:href]
  open_page("http://www.akabou.ne.jp/#{relative_href}")
end

# Checks whether +mail_address+ contains something shaped like an e-mail address.
#
# @param mail_address [String, nil] text to inspect
# @return [Integer, nil] offset of the first match of MAIL_ADDRESS_REGEXP,
#   or nil when there is none (so the result is usable as a truthy/falsy flag)
def validate_mail_address(mail_address)
  MAIL_ADDRESS_REGEXP =~ mail_address
end

# Collects the values written to the CSV for one company.
#
# Two detail-page layouts are handled. Which selector set applies is decided
# by whether +mail_address+ passes validate_mail_address.
#
# @param element [#text] link node for the company; its text becomes the first value
# @param page [#search] parsed company detail page
# @param mail_address [String] text the caller read from the page, used only to pick the layout
# @return [Array] four values in order: link text converted to UTF-8, the
#   second MAIL_ADDRESS_REGEXP match inside the mail link's markup (nil when
#   there are fewer than two matches), and the text found by the two remaining selectors
def extract_company_info(element, page, mail_address)
  # Selector triple: mail link, then the two text cells read after it.
  # Tel/fax are not extracted yet; the original author left these candidates:
  #   layout A: '.grid-7 tr:nth-child(4) td', ':content("ファックス")+ td'
  #   layout B: 'td td tr:nth-child(8) .standard', 'tr:nth-child(10) .standard'
  mail_selector, third_selector, fourth_selector =
    if validate_mail_address(mail_address)
      ['tr:nth-child(1) a', '.grid-7 tr:nth-child(2) td', '.grid-7 tr:nth-child(3) td']
    else
      ['tr:nth-child(3) a', 'tr:nth-child(4) .standard', 'tr:nth-child(6) .standard']
    end

  # Gather the data needed for the saved row.
  # NOTE(review): judging by the caller's CSV header order these are presumably
  # company name, mail address, representative and location — confirm.
  company_name = element.text.toutf8
  address_matches = page.search(mail_selector).to_s.scan(MAIL_ADDRESS_REGEXP)
  third_value = page.search(third_selector).text
  fourth_value = page.search(fourth_selector).text

  [company_name, address_matches[1], third_value, fourth_value]
end

# agent = Mechanize.new
# Scrapes every prefecture listing and writes one CSV row per company to
# 'xxx.csv' in the current directory.
#
# For each prefecture number 1..47 the listing page is opened, every company
# link on it is visited, and the values returned by extract_company_info are
# appended to the CSV. Pagination is followed through next_page until it
# reports that there is no further page.
#
# The header declares six columns while extract_company_info currently returns
# four values, so the tel/fax columns stay empty.
#
# @return [Object] the value of the CSV.open block
def scrape_to_csv
  header = %w[会社名 メールアドレス 代表者名 所在地 tel fax]
  CSV.open('xxx.csv', 'w', headers: header, write_headers: true) do |csv|
    # One listing per prefecture (47 in total).
    1.upto(47) do |num|
      page = open_page(pref_url(num))
      visited_listings = []

      # next_page returns [] (which has no #search) once there is no next link.
      while page.respond_to?(:search)
        # Per the original author's note, a listing page holds 30 companies;
        # each matched anchor points at one company's detail page.
        company_links = page.search('.mini2+ .mini2 a')
        hrefs = company_links.map { |link| link[:href] }
        # Stop on an empty listing, or when the "next" link leads back to a
        # listing that was already processed (guards against endless paging).
        break if hrefs.empty? || visited_listings.include?(hrefs)

        visited_listings << hrefs
        company_links.each do |company_search|
          # sleep(1)
          company_page = open_page(company_search[:href])

          # The mail address sits under tr:nth-child(1) or tr:nth-child(3),
          # depending on the page layout; extract_company_info picks the layout.
          mail_address = company_page.search('tr:nth-child(1) a').text

          company_info =
            extract_company_info(company_search, company_page, mail_address)
          csv << company_info unless company_info.empty?
        end

        # Move on to the next listing page of this prefecture.
        page = next_page(page)
      end
    end
  end
end
	require 'kconv'
	require 'open-uri'
	require 'mechanize'
	require 'csv'

	# Loose pattern for pulling an e-mail address out of surrounding text/markup.
	# The local part accepts word characters plus '.', '+' and '-' so that dotted
	# or tagged addresses (first.last@..., user+tag@...) are captured whole instead
	# of being cut at the last dot. The domain part requires at least one dot.
	MAIL_ADDRESS_REGEXP = /[\w.+-]+@[\w-]+\.[\w.-]+/.freeze

	# Root URL that pref_url prefixes to its listing path.
	# NOTE(review): the host looks like a masked placeholder — confirm the real value.
	#
	# @return [String]
	def base_url
	  'http://xxxxxxxxxxxxx/'
	end

	# Fetches +url+ and parses the response as an EUC-JP encoded HTML document.
	#
	# Uses URI.open (from open-uri) because Kernel#open no longer opens URLs on
	# Ruby 3.0+, and the block form so the underlying handle is closed once the
	# document has been parsed.
	#
	# @param url [String] address of the page to fetch
	# @return [Object] whatever Nokogiri::HTML returns for the fetched markup
	def open_page(url)
	  URI.open(url, 'r:euc-jp') do |io|
	    Nokogiri::HTML(io, nil, 'euc-jp')
	  end
	end

	# Builds the listing URL for one prefecture.
	#
	# The prefecture number is mapped onto one of eight page indexes; a number
	# that matches no branch leaves the index empty in the resulting URL.
	#
	# @param num [Integer] prefecture number (the caller iterates 1..47)
	# @return [String] "<base_url><page index>&pref_id=<num>"
	def pref_url(num)
	  # Page index that corresponds to each of the 47 prefectures.
	  listing_index =
	    case num
	    when 1, 2   then 1
	    when 3..8   then 2
	    when 9..15  then 3
	    when 16..25 then 4
	    when 26..31 then 5
	    when 32..36 then 6
	    when 37..40 then 7
	    when 41..47 then 8
	    end

	  "#{base_url}#{listing_index}&pref_id=#{num}"
	end

	# Opens the page linked from the first 'span+ a' anchor of +page+.
	#
	# NOTE(review): the host is hard-coded here instead of coming from base_url —
	# confirm both refer to the same site.
	#
	# @param page [#search] parsed listing page
	# @return [Array, Object] an empty Array when no 'span+ a' anchor exists,
	#   otherwise the result of open_page for the linked URL
	def next_page(page)
	  following_links = page.search('span+ a')
	  return [] if following_links.empty?

	  relative_href = following_links.first[:href]
	  open_page("http://www.akabou.ne.jp/#{relative_href}")
	end

	# Checks whether +mail_address+ contains something shaped like an e-mail address.
	#
	# @param mail_address [String, nil] text to inspect
	# @return [Integer, nil] offset of the first match of MAIL_ADDRESS_REGEXP,
	#   or nil when there is none (so the result is usable as a truthy/falsy flag)
	def validate_mail_address(mail_address)
	  MAIL_ADDRESS_REGEXP =~ mail_address
	end

	# Collects the values written to the CSV for one company.
	#
	# Two detail-page layouts are handled. Which selector set applies is decided
	# by whether +mail_address+ passes validate_mail_address.
	#
	# @param element [#text] link node for the company; its text becomes the first value
	# @param page [#search] parsed company detail page
	# @param mail_address [String] text the caller read from the page, used only to pick the layout
	# @return [Array] four values in order: link text converted to UTF-8, the
	#   second MAIL_ADDRESS_REGEXP match inside the mail link's markup (nil when
	#   there are fewer than two matches), and the text found by the two remaining selectors
	def extract_company_info(element, page, mail_address)
	  # Selector triple: mail link, then the two text cells read after it.
	  # Tel/fax are not extracted yet; the original author left these candidates:
	  #   layout A: '.grid-7 tr:nth-child(4) td', ':content("ファックス")+ td'
	  #   layout B: 'td td tr:nth-child(8) .standard', 'tr:nth-child(10) .standard'
	  mail_selector, third_selector, fourth_selector =
	    if validate_mail_address(mail_address)
	      ['tr:nth-child(1) a', '.grid-7 tr:nth-child(2) td', '.grid-7 tr:nth-child(3) td']
	    else
	      ['tr:nth-child(3) a', 'tr:nth-child(4) .standard', 'tr:nth-child(6) .standard']
	    end

	  # Gather the data needed for the saved row.
	  # NOTE(review): judging by the caller's CSV header order these are presumably
	  # company name, mail address, representative and location — confirm.
	  company_name = element.text.toutf8
	  address_matches = page.search(mail_selector).to_s.scan(MAIL_ADDRESS_REGEXP)
	  third_value = page.search(third_selector).text
	  fourth_value = page.search(fourth_selector).text

	  [company_name, address_matches[1], third_value, fourth_value]
	end

	# agent = Mechanize.new
	# Scrapes every prefecture listing and writes one CSV row per company to
	# 'xxx.csv' in the current directory.
	#
	# For each prefecture number 1..47 the listing page is opened, every company
	# link on it is visited, and the values returned by extract_company_info are
	# appended to the CSV. Pagination is followed through next_page until it
	# reports that there is no further page.
	#
	# The header declares six columns while extract_company_info currently returns
	# four values, so the tel/fax columns stay empty.
	#
	# @return [Object] the value of the CSV.open block
	def scrape_to_csv
	  # Six separate column names (the pasted copy had the first four fused).
	  header = %w[会社名 メールアドレス 代表者名 所在地 tel fax]
	  CSV.open('xxx.csv', 'w', headers: header, write_headers: true) do |csv|
	    # One listing per prefecture (47 in total).
	    1.upto(47) do |num|
	      page = open_page(pref_url(num))
	      visited_listings = []

	      # next_page returns [] (which has no #search) once there is no next link.
	      while page.respond_to?(:search)
	        # Per the original author's note, a listing page holds 30 companies;
	        # each matched anchor points at one company's detail page.
	        company_links = page.search('.mini2+ .mini2 a')
	        hrefs = company_links.map { |link| link[:href] }
	        # Stop on an empty listing, or when the "next" link leads back to a
	        # listing that was already processed (guards against endless paging).
	        break if hrefs.empty? || visited_listings.include?(hrefs)

	        visited_listings << hrefs
	        company_links.each do |company_search|
	          # sleep(1)
	          company_page = open_page(company_search[:href])

	          # The mail address sits under tr:nth-child(1) or tr:nth-child(3),
	          # depending on the page layout; extract_company_info picks the layout.
	          mail_address = company_page.search('tr:nth-child(1) a').text

	          company_info =
	            extract_company_info(company_search, company_page, mail_address)
	          csv << company_info unless company_info.empty?
	        end

	        # Move on to the next listing page of this prefecture.
	        page = next_page(page)
	      end
	    end
	  end
	end