Skip to content

Instantly share code, notes, and snippets.

@bartolsthoorn
Last active August 29, 2015 14:10
Show Gist options
  • Save bartolsthoorn/4b1b71b133c8282a0ff5 to your computer and use it in GitHub Desktop.
Save bartolsthoorn/4b1b71b133c8282a0ff5 to your computer and use it in GitHub Desktop.
Scraper using Simhash with DOM trees. This is a proof of concept that scrapes Hackernews with just 3 lines (34-36) of configuration.
wget https://news.ycombinator.com
ruby scraper.rb
require 'nokogiri'
require 'simhash'
# Nokogiri sugar
class Nokogiri::XML::Node
def to_outline
children.find_all(&:element?).map(&:to_outline).join
end
end
class Nokogiri::XML::Element
def to_outline
"<#{name}>#{super}</#{name}>"
end
end
module Nokogiri::XML
def self.remove_markup_outline(str)
f = Nokogiri::XML.fragment(str)
['b', 'strong', 'strike', 'i', 'u', 'img'].each do |s|
f.search('.//' + s).remove
end
f.to_outline
end
end
beginning = Time.now
puts 'Opening index.html'
document = Nokogiri::HTML(open('index.html'))
filters = {
title: 'td.title a',
comments: /(\d+) comments/,
points: /(\d+) points/,
}
# Reorder DOM trees by simhash
trees = []
document.search('//div','//tr').each do |tree|
structure = Nokogiri::XML.remove_markup_outline(tree.inner_html).strip
if structure.empty?
tree.remove # This tree is useless
next
end
trees << {
simhash: structure.simhash,
node: tree,
index: tree.xpath('count(preceding-sibling::*)').to_i
}
end
trees.sort_by! { |tree| tree[:simhash] }
puts "#{trees.count} DOM trees collected and sorted"
# Group trees by high similarity
groups = []
group = []
last_tree = trees[0]
trees.each do |tree|
hamming_distance = (tree[:simhash] ^ last_tree[:simhash]).to_s(2).count('1')
similarity = 1 - (hamming_distance / 64.0)
if similarity < 0.6
groups << {trees: group}
group = []
end
group << tree
last_tree = tree
end
groups << {trees: group}
groups.reject! { |c| c.empty? }
puts "#{groups.count} groups"
# Score the groups with the filters
groups.each do |group|
group[:score] = 0
group[:trees].each do |tree|
filters.each do |key, filter|
if filter.class == String
value = tree[:node].at(filter)
if value
tree[:item] ||= {}
tree[:item][key] = value.text
end
end
if filter.class == Regexp
match = filter.match(tree[:node].inner_html)
if match && match.captures
tree[:item] ||= {}
tree[:item][key] = match.captures[0]
end
end
end
group[:score] += tree[:item].to_h.count
end
group[:items] = group[:trees].map{|tree| tree[:item] }
.reject{|c| c.nil?}.count
group[:indices] = group[:trees].map{|tree| tree[:index]}.sort
end
groups.sort_by! { |group| group[:score] }
good_groups = groups.select { |group|
(group[:score]/group[:items].to_f >= 1.0) &&
(group[:items] > 1)
}
puts "\nGroups stats"
puts groups.map{|x|
{
trees: x[:trees].count,
score: x[:score],
items: x[:items],
indices: x[:indices]
}
}
puts "Found #{good_groups.count} good groups"
best_group = groups[-1]
# If multiple good groups are found, it's most likely because a single data
# item does NOT have a single parent HTML element. This means a single
# result is spread out over two or more DOM trees.
# Test if the groups interleave to join the trees back together as if they had
# a single parent element.
# TODO: This method is crazy
# it currently only handles interleaving indices that differ by only 1
if good_groups.count == 2
# Alternative: a.zip(b).flatten == a.zip(b).flatten.sort
interleaved = good_groups[0][:indices].zip(good_groups[1][:indices])
.each_cons(2)
.all?{|i,j| i.first <= i.last && i.last <= j.first && j.first <= j.last}
if interleaved
puts 'WARNING: Interleaving.. a risky business'
# Start interleaving with the group that has the lowest index
gg = good_groups.sort_by{|group| group[:indices][0]}
gg[0][:trees].each do |tree|
# Find accompanying tree
i = tree[:index]
tree[:item].merge!(
gg[1][:trees].select{|other_tree| other_tree[:index] == i+1 }[0][:item]
)
end
best_group = gg[0]
end
end
puts "\nBest group items"
best_group[:trees].each do |tree|
puts "index: #{tree[:index]}"
tree[:item].each do |key, value|
puts "#{key}:\t#{value}"
end
puts ''
end
puts "Found #{best_group[:trees].count} items"
puts "Time elapsed #{Time.now - beginning} seconds"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment