eladmeidar/fetch_logos.rb

## fetch_logos.rb

# Usage: ruby fetch_logos.rb <FULL_DOMAIN>
require 'json'
require 'uri'

require 'byebug'
require 'css_parser'
require 'httparty'
require 'nokogiri'

class Array
  # Coerces +object+ into an Array.
  #
  # An Array argument is returned as-is (the very same object). Anything else
  # is placed in a one-element array and flattened, so +nil+ becomes +[nil]+.
  def self.wrap(object)
    return object if object.is_a?(Array)

    [object].flatten
  end
end

# Scrapes a web page for likely logo images and prints their absolute URLs as
# a JSON array on stdout.
#
# Example: ruby fetch_logos.rb http://github.com
#
# Three heuristics are applied, in order:
#   1. <img> tags whose src mentions "logo".
#   2. <img> tags nested in elements whose id/class mentions "logo" or
#      "header", or nested in a <header> element.
#   3. url(...) references mentioning "logo" inside linked stylesheets.

# Resolves +ref+ against +base+ and returns the absolute URL as a String.
# Returns nil for blank references, inline data: URIs and anything URI cannot
# parse, so callers can simply skip unusable links.
def absolute_url(base, ref)
  ref = ref.to_s.strip
  return if ref.empty? || ref.start_with?('data:')

  URI.join(base, ref).to_s
rescue URI::Error
  nil
end

# True when +link+ mentions one of the supported image extensions
# (case-insensitive substring match, so query strings are tolerated).
def image_link?(link)
  lowered = link.downcase
  %w[jpg jpeg png svg].any? { |ext| lowered.include?(ext) }
end

# Returns every target found in url(...) expressions of a CSS declaration
# string, with surrounding quotes and whitespace removed.
def css_url_refs(declarations)
  declarations.scan(/url\(\s*['"]?([^'")]+?)['"]?\s*\)/i).flatten.map(&:strip)
end

# Downloads the stylesheet at +css_url+ and returns the absolute URLs of all
# url(...) references in it that mention "logo". Relative references are
# resolved against the stylesheet's own URL, as CSS requires.
def stylesheet_logo_urls(css_url)
  parser = CssParser::Parser.new
  parser.load_string!(HTTParty.get(css_url).body.to_s)

  found = []
  parser.each_selector do |_selector, declarations, _specificity|
    css_url_refs(declarations).each do |ref|
      next unless ref.downcase.include?('logo')

      url = absolute_url(css_url, ref)
      found << url if url
    end
  end
  found
end

domain = ARGV[0]
abort 'Usage: ruby fetch_logos.rb <FULL_DOMAIN>' if domain.nil? || domain.empty?

html = HTTParty.get(domain).body
response = Nokogiri::HTML(html)
results = []

# Heuristic 1: IMG src attributes containing the word 'logo'.
logo_srcs = response.xpath('//img/@src').map(&:value).select { |src| src.downcase.include?('logo') }

# Heuristic 2: IMG src from elements that have 'header' or 'logo' in their
# CSS id or class, or that live inside a <header> element.
container_xpaths = [
  "//*[contains(normalize-space(@class), 'logo')]//img/@src",
  "//*[contains(normalize-space(@id), 'header')]//img/@src",
  "//*[contains(normalize-space(@id), 'logo')]//img/@src",
  "//*[contains(normalize-space(@class), 'header')]//img/@src",
  '//header//img/@src'
]
container_srcs = container_xpaths.flat_map { |xpath| response.xpath(xpath).map(&:value) }

(logo_srcs + container_srcs).each do |src|
  next unless image_link?(src)

  url = absolute_url(domain, src)
  results << url if url
end

# Heuristic 3: url(...) references in linked stylesheets. Stylesheets are
# matched by type or rel because HTML5 pages commonly omit type="text/css".
css_hrefs = response.xpath('//link[@type="text/css" or @rel="stylesheet"]/@href').map(&:value)
css_hrefs.each do |href|
  css_url = absolute_url(domain, href)
  next unless css_url

  begin
    results.concat(stylesheet_logo_urls(css_url))
  rescue StandardError => e
    # A single unreachable or malformed stylesheet should not discard the
    # results already collected; report it and move on.
    warn "Skipping stylesheet #{css_url}: #{e.message}"
  end
end

puts JSON.dump(results.uniq)

	# Usage: ruby fetch_logos.rb <FULL_DOMAIN>
	require 'httparty'
	require 'nokogiri'
	require 'css_parser'
	require 'byebug'

	class Array
	  # Returns +object+ unchanged when it already is an Array; otherwise wraps
	  # it in a single-element array and flattens that array (nil => [nil]).
	  def self.wrap(object)
	    object.is_a?(Array) ? object : [object].flatten
	  end
	end

	# Scrapes a web page for likely logo images and prints their absolute URLs as
	# a JSON array on stdout.
	#
	# Example: ruby fetch_logos.rb http://github.com
	#
	# Three heuristics are applied, in order:
	#   1. <img> tags whose src mentions "logo".
	#   2. <img> tags nested in elements whose id/class mentions "logo" or
	#      "header", or nested in a <header> element.
	#   3. url(...) references mentioning "logo" inside linked stylesheets.

	# Resolves +ref+ against +base+ and returns the absolute URL as a String.
	# Returns nil for blank references, inline data: URIs and anything URI cannot
	# parse, so callers can simply skip unusable links.
	def absolute_url(base, ref)
	  ref = ref.to_s.strip
	  return if ref.empty? || ref.start_with?('data:')

	  URI.join(base, ref).to_s
	rescue URI::Error
	  nil
	end

	# True when +link+ mentions one of the supported image extensions
	# (case-insensitive substring match, so query strings are tolerated).
	def image_link?(link)
	  lowered = link.downcase
	  %w[jpg jpeg png svg].any? { |ext| lowered.include?(ext) }
	end

	# Returns every target found in url(...) expressions of a CSS declaration
	# string, with surrounding quotes and whitespace removed.
	def css_url_refs(declarations)
	  declarations.scan(/url\(\s*['"]?([^'")]+?)['"]?\s*\)/i).flatten.map(&:strip)
	end

	# Downloads the stylesheet at +css_url+ and returns the absolute URLs of all
	# url(...) references in it that mention "logo". Relative references are
	# resolved against the stylesheet's own URL, as CSS requires.
	def stylesheet_logo_urls(css_url)
	  parser = CssParser::Parser.new
	  parser.load_string!(HTTParty.get(css_url).body.to_s)

	  found = []
	  parser.each_selector do |_selector, declarations, _specificity|
	    css_url_refs(declarations).each do |ref|
	      next unless ref.downcase.include?('logo')

	      url = absolute_url(css_url, ref)
	      found << url if url
	    end
	  end
	  found
	end

	domain = ARGV[0]
	abort 'Usage: ruby fetch_logos.rb <FULL_DOMAIN>' if domain.nil? || domain.empty?

	html = HTTParty.get(domain).body
	response = Nokogiri::HTML(html)
	results = []

	# Heuristic 1: IMG src attributes containing the word 'logo'.
	logo_srcs = response.xpath('//img/@src').map(&:value).select { |src| src.downcase.include?('logo') }

	# Heuristic 2: IMG src from elements that have 'header' or 'logo' in their
	# CSS id or class, or that live inside a <header> element.
	container_xpaths = [
	  "//*[contains(normalize-space(@class), 'logo')]//img/@src",
	  "//*[contains(normalize-space(@id), 'header')]//img/@src",
	  "//*[contains(normalize-space(@id), 'logo')]//img/@src",
	  "//*[contains(normalize-space(@class), 'header')]//img/@src",
	  '//header//img/@src'
	]
	container_srcs = container_xpaths.flat_map { |xpath| response.xpath(xpath).map(&:value) }

	(logo_srcs + container_srcs).each do |src|
	  next unless image_link?(src)

	  url = absolute_url(domain, src)
	  results << url if url
	end

	# Heuristic 3: url(...) references in linked stylesheets. Stylesheets are
	# matched by type or rel because HTML5 pages commonly omit type="text/css".
	css_hrefs = response.xpath('//link[@type="text/css" or @rel="stylesheet"]/@href').map(&:value)
	css_hrefs.each do |href|
	  css_url = absolute_url(domain, href)
	  next unless css_url

	  begin
	    results.concat(stylesheet_logo_urls(css_url))
	  rescue StandardError => e
	    # A single unreachable or malformed stylesheet should not discard the
	    # results already collected; report it and move on.
	    warn "Skipping stylesheet #{css_url}: #{e.message}"
	  end
	end

	puts JSON.dump(results.uniq)