bartolsthoorn/scrape.sh

## scrape.sh
wget https://news.ycombinator.com
ruby scraper.rb

## scraper.rb
require 'nokogiri'
require 'simhash'

# Nokogiri sugar
class Nokogiri::XML::Node
  def to_outline
    children.find_all(&:element?).map(&:to_outline).join
  end
end

class Nokogiri::XML::Element
  def to_outline
    "<#{name}>#{super}</#{name}>"
  end
end

module Nokogiri::XML
  def self.remove_markup_outline(str)
    f = Nokogiri::XML.fragment(str)
    ['b', 'strong', 'strike', 'i', 'u', 'img'].each do |s|
      f.search('.//' + s).remove
    end
    f.to_outline
  end
end


beginning = Time.now

puts 'Opening index.html'
document = Nokogiri::HTML(open('index.html'))

filters = {
  title: 'td.title a',
  comments: /(\d+) comments/,
  points: /(\d+) points/,
}

# Reorder DOM trees by simhash
trees = []
document.search('//div','//tr').each do |tree|
  structure = Nokogiri::XML.remove_markup_outline(tree.inner_html).strip
  if structure.empty?
    tree.remove # This tree is useless
    next
  end

  trees << {
    simhash: structure.simhash,
    node: tree,
    index: tree.xpath('count(preceding-sibling::*)').to_i
  }
end
trees.sort_by! { |tree| tree[:simhash] }

puts "#{trees.count} DOM trees collected and sorted"


# Group trees by high similarity
groups = []
group = []
last_tree = trees[0]
trees.each do |tree|
  hamming_distance = (tree[:simhash] ^ last_tree[:simhash]).to_s(2).count('1')
  similarity = 1 - (hamming_distance / 64.0)
  if similarity < 0.6
    groups << {trees: group}
    group = []
  end
  group << tree
  last_tree = tree
end
groups << {trees: group}
groups.reject! { |c| c.empty? }

puts "#{groups.count} groups"


# Score the groups with the filters
groups.each do |group|
  group[:score] = 0
  group[:trees].each do |tree|
    filters.each do |key, filter|
      if filter.class == String
        value = tree[:node].at(filter)
        if value
          tree[:item] ||= {}
          tree[:item][key] = value.text
        end
      end
      if filter.class == Regexp
        match = filter.match(tree[:node].inner_html)
        if match && match.captures
          tree[:item] ||= {}
          tree[:item][key] = match.captures[0]
        end
      end
    end

    group[:score] += tree[:item].to_h.count
  end
  group[:items] = group[:trees].map{|tree| tree[:item] }
    .reject{|c| c.nil?}.count
  group[:indices] = group[:trees].map{|tree| tree[:index]}.sort
end

groups.sort_by! { |group| group[:score] }
good_groups = groups.select { |group|
  (group[:score]/group[:items].to_f >= 1.0) &&
  (group[:items] > 1)
}

puts "\nGroups stats"
puts groups.map{|x|
  {
    trees: x[:trees].count,
    score: x[:score],
    items: x[:items],
    indices: x[:indices]
  }
}
puts "Found #{good_groups.count} good groups"

best_group = groups[-1]

# If multiple good groups are found, it's most likely because a single data
# item does NOT have a single parent HTML element. This means a single
# result is spread out over two or more DOM trees.
# Test if the groups interleave to join the trees back together as if they had
# a single parent element.
# TODO: This method is crazy
#       it currently only handles interleaving indices that differ by only 1
if good_groups.count == 2
  # Alternative: a.zip(b).flatten == a.zip(b).flatten.sort
  interleaved = good_groups[0][:indices].zip(good_groups[1][:indices])
    .each_cons(2)
    .all?{|i,j| i.first <= i.last && i.last <= j.first && j.first <= j.last}

  if interleaved
    puts 'WARNING: Interleaving.. a risky business'
    # Start interleaving with the group that has the lowest index
    gg = good_groups.sort_by{|group| group[:indices][0]}
    gg[0][:trees].each do |tree|
      # Find accompanying tree
      i = tree[:index]
      tree[:item].merge!(
        gg[1][:trees].select{|other_tree| other_tree[:index] == i+1 }[0][:item]
      )
    end
    best_group = gg[0]
  end
end

puts "\nBest group items"
best_group[:trees].each do |tree|
  puts "index: #{tree[:index]}"
  tree[:item].each do |key, value|
    puts "#{key}:\t#{value}"
  end
  puts ''
end

puts "Found #{best_group[:trees].count} items"

puts "Time elapsed #{Time.now - beginning} seconds"
	require 'nokogiri'
	require 'simhash'

	# Nokogiri sugar
	class Nokogiri::XML::Node
	def to_outline
	children.find_all(&:element?).map(&:to_outline).join
	end
	end

	class Nokogiri::XML::Element
	def to_outline
	"<#{name}>#{super}</#{name}>"
	end
	end

	module Nokogiri::XML
	def self.remove_markup_outline(str)
	f = Nokogiri::XML.fragment(str)
	['b', 'strong', 'strike', 'i', 'u', 'img'].each do \|s\|
	f.search('.//' + s).remove
	end
	f.to_outline
	end
	end


	beginning = Time.now

	puts 'Opening index.html'
	document = Nokogiri::HTML(open('index.html'))

	filters = {
	title: 'td.title a',
	comments: /(\d+) comments/,
	points: /(\d+) points/,
	}

	# Reorder DOM trees by simhash
	trees = []
	document.search('//div','//tr').each do \|tree\|
	structure = Nokogiri::XML.remove_markup_outline(tree.inner_html).strip
	if structure.empty?
	tree.remove # This tree is useless
	next
	end

	trees << {
	simhash: structure.simhash,
	node: tree,
	index: tree.xpath('count(preceding-sibling::*)').to_i
	}
	end
	trees.sort_by! { \|tree\| tree[:simhash] }

	puts "#{trees.count} DOM trees collected and sorted"


	# Group trees by high similarity
	groups = []
	group = []
	last_tree = trees[0]
	trees.each do \|tree\|
	hamming_distance = (tree[:simhash] ^ last_tree[:simhash]).to_s(2).count('1')
	similarity = 1 - (hamming_distance / 64.0)
	if similarity < 0.6
	groups << {trees: group}
	group = []
	end
	group << tree
	last_tree = tree
	end
	groups << {trees: group}
	groups.reject! { \|c\| c.empty? }

	puts "#{groups.count} groups"


	# Score the groups with the filters
	groups.each do \|group\|
	group[:score] = 0
	group[:trees].each do \|tree\|
	filters.each do \|key, filter\|
	if filter.class == String
	value = tree[:node].at(filter)
	if value
	tree[:item] \|\|= {}
	tree[:item][key] = value.text
	end
	end
	if filter.class == Regexp
	match = filter.match(tree[:node].inner_html)
	if match && match.captures
	tree[:item] \|\|= {}
	tree[:item][key] = match.captures[0]
	end
	end
	end

	group[:score] += tree[:item].to_h.count
	end
	group[:items] = group[:trees].map{\|tree\| tree[:item] }
	.reject{\|c\| c.nil?}.count
	group[:indices] = group[:trees].map{\|tree\| tree[:index]}.sort
	end

	groups.sort_by! { \|group\| group[:score] }
	good_groups = groups.select { \|group\|
	(group[:score]/group[:items].to_f >= 1.0) &&
	(group[:items] > 1)
	}

	puts "\nGroups stats"
	puts groups.map{\|x\|
	{
	trees: x[:trees].count,
	score: x[:score],
	items: x[:items],
	indices: x[:indices]
	}
	}
	puts "Found #{good_groups.count} good groups"

	best_group = groups[-1]

	# If multiple good groups are found, it's most likely because a single data
	# item does NOT have a single parent HTML element. This means a single
	# result is spread out over two or more DOM trees.
	# Test if the groups interleave to join the trees back together as if they had
	# a single parent element.
	# TODO: This method is crazy
	# it currently only handles interleaving indices that differ by only 1
	if good_groups.count == 2
	# Alternative: a.zip(b).flatten == a.zip(b).flatten.sort
	interleaved = good_groups[0][:indices].zip(good_groups[1][:indices])
	.each_cons(2)
	.all?{\|i,j\| i.first <= i.last && i.last <= j.first && j.first <= j.last}

	if interleaved
	puts 'WARNING: Interleaving.. a risky business'
	# Start interleaving with the group that has the lowest index
	gg = good_groups.sort_by{\|group\| group[:indices][0]}
	gg[0][:trees].each do \|tree\|
	# Find accompanying tree
	i = tree[:index]
	tree[:item].merge!(
	gg[1][:trees].select{\|other_tree\| other_tree[:index] == i+1 }[0][:item]
	)
	end
	best_group = gg[0]
	end
	end

	puts "\nBest group items"
	best_group[:trees].each do \|tree\|
	puts "index: #{tree[:index]}"
	tree[:item].each do \|key, value\|
	puts "#{key}:\t#{value}"
	end
	puts ''
	end

	puts "Found #{best_group[:trees].count} items"

	puts "Time elapsed #{Time.now - beginning} seconds"