Skip to content

Instantly share code, notes, and snippets.

@faucct
Last active August 29, 2015 14:04
Show Gist options
  • Save faucct/c2608cbac966502f0504 to your computer and use it in GitHub Desktop.
Save faucct/c2608cbac966502f0504 to your computer and use it in GitHub Desktop.
parser examples
namespace :fragrantica do
require 'nokogiri'
require 'open-uri'
# Downloads +url+ and returns it parsed as a Nokogiri HTML document.
#
# Uses URI#open (provided by open-uri) instead of Kernel#open: Kernel#open no
# longer accepts URLs on Ruby 3.0+. The block form closes the underlying
# IO/tempfile as soon as the page has been parsed.
#
# @param url [String] absolute http(s) URL of the page to fetch
# @return [Nokogiri::HTML::Document]
def fetch_fragrantica_doc(url)
  URI.parse(url).open { |page| Nokogiri::HTML(page) }
end

# rake fragrantica:brands
#
# Walks the designer index pages (designers-1 .. designers-11) and makes sure a
# Brand row exists for every brand link found on them, filling a new row from
# the brand's own page.
task :brands => :environment do
  1.upto(11) do |page_number|
    index_doc = fetch_fragrantica_doc("http://www.fragrantica.com/designers-#{page_number}/")
    index_doc.css("div#col1 div.nduList a").each do |brand_anchor|
      # The link text is stored without its first and last character
      # (presumably surrounding whitespace in the markup -- TODO confirm).
      brand_name = brand_anchor.content[1..-2]
      # NOTE(review): the block passed to first_or_create is expected to run only
      # when no Brand with this name exists yet, so known brands are not re-fetched.
      Brand.where(name: brand_name).first_or_create do |new_brand|
        puts "parsing brand '#{brand_name}'"
        brand_doc = fetch_fragrantica_doc("http://www.fragrantica.com#{brand_anchor['href']}")
        new_brand.description = parse_brand_description(brand_doc)
        new_brand.logo = parse_brand_logo(brand_doc)
        new_brand.niche = parse_brand_niche(brand_doc)
        new_brand.link = parse_brand_link(brand_doc)
        new_brand.parent_company_id = parse_brand_parent_company_id(brand_doc)
        new_brand.country_id = parse_brand_country_id(brand_doc)
        new_brand.industry_id = parse_brand_industry_id(brand_doc)
      end
    end
  end
end
# Extracts the brand logo from a parsed brand page.
#
# Reads the `src` of the first element matching "#col1 > div > img", wraps it in
# a URI and runs Paperclip's media-type spoof detector against it.
#
# @param brand_doc [#at_css] parsed brand page (a Nokogiri document in the rake task)
# @return [URI, nil] the logo URI; nil when the page has no logo image (or the
#   image has no usable src), when the spoof detector flags the file, or when an
#   OpenURI::HTTPError is raised while handling it
def parse_brand_logo(brand_doc)
  logo_img = brand_doc.at_css("#col1 > div > img")
  brand_logo_url = logo_img && logo_img['src']
  # No logo on the page: nothing to attach, and nothing to crash on.
  return if brand_logo_url.nil? || brand_logo_url.empty?

  logo = URI.parse(brand_logo_url)
  # A throwaway, unsaved Brand is used only to obtain an attachment object for
  # the spoof check. NOTE(review): presumably assigning the URI is what makes
  # Paperclip download the file, which is where HTTPError would originate -- confirm.
  brand = Brand.new(logo: logo)
  adapter = Paperclip.io_adapters.for(brand.logo)
  spoofed = Paperclip::MediaTypeSpoofDetector.using(adapter, brand.logo.original_filename).spoofed?
  spoofed ? nil : logo
rescue OpenURI::HTTPError
  # The logo could not be fetched; the brand is simply left without one.
  nil
end
# Resolves the brand's country from a parsed brand page.
#
# Picks the first link under "#col1 > div > p > a" whose href contains
# "country" and finds or creates the BrandCountry named after that link's text.
#
# @param brand_doc [#css] parsed brand page (a Nokogiri document in the rake task)
# @return [Integer, nil] id of the matching BrandCountry, or nil when the page
#   has no country link
def parse_brand_country_id(brand_doc)
  country_anchor = brand_doc.css('#col1 > div > p > a').find do |anchor|
    # Anchors without an href yield nil here; to_s makes them a plain non-match.
    anchor['href'].to_s.include?('country')
  end
  return unless country_anchor

  BrandCountry.where(name: country_anchor.content).first_or_create.id
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment