ptrin/scrape.rb

## scrape.rb
require "fileutils"
require "httparty"
require "json"
require "nokogiri"
require "open-uri"

# Base URL of the scraped site; this is the text strip_domain removes.
DOMAIN = "http://www.warplane.com/"

# Returns a copy of +url+ with every occurrence of DOMAIN removed,
# e.g. "http://www.warplane.com/images/a.jpg" becomes "images/a.jpg".
def strip_domain(url)
  without_domain = url.gsub(DOMAIN, "")
  without_domain
end

# Builds the list of planes from the collection page: one hash per
# ".div-button a" link, skipping links whose href is "#".
# Each hash holds the link's href ("link"), the domain-stripped src of its
# first <img> ("thumbnail") and the text of its first <b> ("name").
def get_planes_array(page)
  page.css(".div-button a").each_with_object([]) do |anchor, planes|
    next if anchor["href"] == "#"

    planes << {
      "link" => anchor["href"],
      "thumbnail" => strip_domain(anchor.css("img").first["src"]),
      "name" => anchor.css("b").first.text
    }
  end
end

# Fetches a plane's detail page and parses it with Nokogiri.
#
# plane - hash whose "link" value is the URL of the detail page.
#
# Returns the parsed Nokogiri HTML document.
def get_detail_page(plane)
  # URI.open instead of Kernel#open: the link is scraped from a web page,
  # and Kernel#open would run a value starting with "|" as a shell command.
  # Kernel#open also stopped opening URLs in Ruby 3.0.
  Nokogiri::HTML(URI.open(plane["link"]))
end

# Collects the gallery photo metadata found on a plane's detail page.
# Nothing is downloaded here; only paths and descriptions are gathered.
#
# detail_page - parsed detail page (anything responding to #css).
#
# Returns an array with one hash per "table[id*=PhotosDataList] a" link:
# "src" (domain-stripped href), "description" (title attribute) and, when
# the link's first <div> has an inline style containing "(...)",
# "thumbnail" (domain-stripped contents of the parentheses).
# If anything raises, the message is printed and an empty array is
# returned, so callers can always iterate over the result.
def get_gallery_photos(detail_page)
  detail_page.css("table[id*=PhotosDataList] a").map do |link|
    new_image = {
      "src" => strip_domain(link["href"]),
      "description" => link["title"]
    }

    # some images have no thumbnail: the <div> or its style attribute may
    # be missing, in which case the regex below simply finds nothing
    preview = link.css("div").first
    inline_style = preview ? preview["style"].to_s : ""
    thumbnail = inline_style[/\((.*)\)/, 1]
    new_image["thumbnail"] = strip_domain(thumbnail) if thumbnail

    new_image
  end
rescue => e
  puts e.message
  []
end

# Extracts a plane's specifications from the sidebar of its detail page.
#
# The sidebar is the last "td[style]" element. Its HTML is split on "<br>"
# and every piece of the form "<b>Label:</b> value" becomes one entry.
#
# detail_page - parsed detail page (anything responding to #css).
#
# Returns a hash mapping each label (downcased, spaces replaced by
# underscores) to its stripped value; empty when nothing matches.
def get_plane_specs(detail_page)
  sidebar = detail_page.css("td[style]").last

  sidebar.to_s.split("<br>").each_with_object({}) do |fragment, specs|
    # Bold text without the "Label:" shape (e.g. a heading) is not a spec;
    # skip it instead of failing on a nil match.
    matches = fragment.match(/<b>(.+):<\/b>(.*)/)
    next unless matches

    specs[matches[1].downcase.gsub(" ", "_")] = matches[2].strip
  end
end

# Downloads DOMAIN + path and saves it under the same path, resolved
# against the current working directory, creating directories as needed.
#
# path - site-relative path of the image, as produced by strip_domain.
#
# Prints a progress line per download. Nothing is written when the server
# answers with a non-2xx status.
#
# NOTE(review): path comes from scraped HTML; an absolute path or one
# containing ".." would be written outside the working directory.
def download_plane_image(path)
  puts "Downloading #{DOMAIN}#{path}"
  response = HTTParty.get(DOMAIN + path)

  # Fetch before opening the file so a failed request cannot leave an
  # empty or error-page file behind.
  unless (200..299).cover?(response.code)
    puts "Skipping #{DOMAIN}#{path} (HTTP #{response.code})"
    return
  end

  localpath = File.expand_path(path)
  FileUtils.mkdir_p(File.dirname(localpath))
  # body is the raw payload; parsed_response may be a decoded object
  # (e.g. a Hash) if the server labels the response as JSON or XML.
  File.open(localpath, "wb") { |f| f.write(response.body) }
end

# Serializes +planes+ as pretty-printed JSON into "planes.json" (relative
# to the current directory) and prints a confirmation line.
def write_json_file(planes)
  File.open("planes.json", "wb") do |out|
    json_text = JSON.pretty_generate(planes)
    out.write(json_text)
  end
  puts "Wrote planes to file"
end

# "main"
# Scrape the collection page, enrich every plane with its gallery images
# and specs, download all pictures, then write planes.json.
# URI.open (from open-uri) is used because Kernel#open no longer opens
# URLs as of Ruby 3.0.
page = Nokogiri::HTML(URI.open("http://www.warplane.com/warplane-vintage-aircraft-collection.aspx"))
planes = get_planes_array(page)

planes.each do |plane|
  detail_page = get_detail_page plane

  plane["images"] = get_gallery_photos detail_page
  plane["specs"] = get_plane_specs detail_page

  # The plane's own thumbnail is fetched once, outside the per-image
  # threads: fetching it inside each thread made several threads write the
  # same file at once and skipped it for planes without gallery images.
  download_plane_image(plane["thumbnail"])

  # Array() keeps the loop safe even if no image list was produced (nil).
  thread_list = Array(plane["images"]).map do |image|
    Thread.new do
      download_plane_image(image["src"])
      download_plane_image(image["thumbnail"]) if image["thumbnail"]
    end
  end
  # one thread per image; wait for this plane's downloads before moving on
  thread_list.each(&:join)
end

write_json_file planes
	require "fileutils"
	require "httparty"
	require "json"
	require "nokogiri"
	require "open-uri"

	# Base URL of the scraped site; this is the text strip_domain removes.
	DOMAIN = "http://www.warplane.com/"

	# Returns a copy of +url+ with every occurrence of DOMAIN removed,
	# e.g. "http://www.warplane.com/images/a.jpg" becomes "images/a.jpg".
	def strip_domain(url)
	  without_domain = url.gsub(DOMAIN, "")
	  without_domain
	end

	# Builds the list of planes from the collection page: one hash per
	# ".div-button a" link, skipping links whose href is "#".
	# Each hash holds the link's href ("link"), the domain-stripped src of its
	# first <img> ("thumbnail") and the text of its first <b> ("name").
	def get_planes_array(page)
	  page.css(".div-button a").each_with_object([]) do |anchor, planes|
	    next if anchor["href"] == "#"

	    planes << {
	      "link" => anchor["href"],
	      "thumbnail" => strip_domain(anchor.css("img").first["src"]),
	      "name" => anchor.css("b").first.text
	    }
	  end
	end

	# Fetches a plane's detail page and parses it with Nokogiri.
	#
	# plane - hash whose "link" value is the URL of the detail page.
	#
	# Returns the parsed Nokogiri HTML document.
	def get_detail_page(plane)
	  # URI.open instead of Kernel#open: the link is scraped from a web page,
	  # and Kernel#open would run a value starting with "|" as a shell command.
	  # Kernel#open also stopped opening URLs in Ruby 3.0.
	  Nokogiri::HTML(URI.open(plane["link"]))
	end

	# Collects the gallery photo metadata found on a plane's detail page.
	# Nothing is downloaded here; only paths and descriptions are gathered.
	#
	# detail_page - parsed detail page (anything responding to #css).
	#
	# Returns an array with one hash per "table[id*=PhotosDataList] a" link:
	# "src" (domain-stripped href), "description" (title attribute) and, when
	# the link's first <div> has an inline style containing "(...)",
	# "thumbnail" (domain-stripped contents of the parentheses).
	# If anything raises, the message is printed and an empty array is
	# returned, so callers can always iterate over the result.
	def get_gallery_photos(detail_page)
	  detail_page.css("table[id*=PhotosDataList] a").map do |link|
	    new_image = {
	      "src" => strip_domain(link["href"]),
	      "description" => link["title"]
	    }

	    # some images have no thumbnail: the <div> or its style attribute may
	    # be missing, in which case the regex below simply finds nothing
	    preview = link.css("div").first
	    inline_style = preview ? preview["style"].to_s : ""
	    thumbnail = inline_style[/\((.*)\)/, 1]
	    new_image["thumbnail"] = strip_domain(thumbnail) if thumbnail

	    new_image
	  end
	rescue => e
	  puts e.message
	  []
	end

	# Extracts a plane's specifications from the sidebar of its detail page.
	#
	# The sidebar is the last "td[style]" element. Its HTML is split on "<br>"
	# and every piece of the form "<b>Label:</b> value" becomes one entry.
	#
	# detail_page - parsed detail page (anything responding to #css).
	#
	# Returns a hash mapping each label (downcased, spaces replaced by
	# underscores) to its stripped value; empty when nothing matches.
	def get_plane_specs(detail_page)
	  sidebar = detail_page.css("td[style]").last

	  sidebar.to_s.split("<br>").each_with_object({}) do |fragment, specs|
	    # Bold text without the "Label:" shape (e.g. a heading) is not a spec;
	    # skip it instead of failing on a nil match.
	    matches = fragment.match(/<b>(.+):<\/b>(.*)/)
	    next unless matches

	    specs[matches[1].downcase.gsub(" ", "_")] = matches[2].strip
	  end
	end

	# Downloads DOMAIN + path and saves it under the same path, resolved
	# against the current working directory, creating directories as needed.
	#
	# path - site-relative path of the image, as produced by strip_domain.
	#
	# Prints a progress line per download. Nothing is written when the server
	# answers with a non-2xx status.
	#
	# NOTE(review): path comes from scraped HTML; an absolute path or one
	# containing ".." would be written outside the working directory.
	def download_plane_image(path)
	  puts "Downloading #{DOMAIN}#{path}"
	  response = HTTParty.get(DOMAIN + path)

	  # Fetch before opening the file so a failed request cannot leave an
	  # empty or error-page file behind.
	  unless (200..299).cover?(response.code)
	    puts "Skipping #{DOMAIN}#{path} (HTTP #{response.code})"
	    return
	  end

	  localpath = File.expand_path(path)
	  FileUtils.mkdir_p(File.dirname(localpath))
	  # body is the raw payload; parsed_response may be a decoded object
	  # (e.g. a Hash) if the server labels the response as JSON or XML.
	  File.open(localpath, "wb") { |f| f.write(response.body) }
	end

	# Serializes +planes+ as pretty-printed JSON into "planes.json" (relative
	# to the current directory) and prints a confirmation line.
	def write_json_file(planes)
	  File.open("planes.json", "wb") do |out|
	    out.write(JSON.pretty_generate(planes))
	  end
	  puts "Wrote planes to file"
	end

	# "main"
	# Scrape the collection page, enrich every plane with its gallery images
	# and specs, download all pictures, then write planes.json.
	# URI.open (from open-uri) is used because Kernel#open no longer opens
	# URLs as of Ruby 3.0.
	page = Nokogiri::HTML(URI.open("http://www.warplane.com/warplane-vintage-aircraft-collection.aspx"))
	planes = get_planes_array(page)

	planes.each do |plane|
	  detail_page = get_detail_page plane

	  plane["images"] = get_gallery_photos detail_page
	  plane["specs"] = get_plane_specs detail_page

	  # The plane's own thumbnail is fetched once, outside the per-image
	  # threads: fetching it inside each thread made several threads write the
	  # same file at once and skipped it for planes without gallery images.
	  download_plane_image(plane["thumbnail"])

	  # Array() keeps the loop safe even if no image list was produced (nil).
	  thread_list = Array(plane["images"]).map do |image|
	    Thread.new do
	      download_plane_image(image["src"])
	      download_plane_image(image["thumbnail"]) if image["thumbnail"]
	    end
	  end
	  # one thread per image; wait for this plane's downloads before moving on
	  thread_list.each(&:join)
	end

	write_json_file planes