Last active
August 29, 2015 14:10
-
-
Save bartolsthoorn/4b1b71b133c8282a0ff5 to your computer and use it in GitHub Desktop.
Scraper using Simhash with DOM trees. This is a proof of concept that scrapes Hackernews with just 3 lines (34-36) of configuration.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
wget https://news.ycombinator.com | |
ruby scraper.rb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'simhash' | |
# Nokogiri sugar | |
class Nokogiri::XML::Node | |
def to_outline | |
children.find_all(&:element?).map(&:to_outline).join | |
end | |
end | |
class Nokogiri::XML::Element | |
def to_outline | |
"<#{name}>#{super}</#{name}>" | |
end | |
end | |
module Nokogiri::XML | |
def self.remove_markup_outline(str) | |
f = Nokogiri::XML.fragment(str) | |
['b', 'strong', 'strike', 'i', 'u', 'img'].each do |s| | |
f.search('.//' + s).remove | |
end | |
f.to_outline | |
end | |
end | |
beginning = Time.now | |
puts 'Opening index.html' | |
document = Nokogiri::HTML(open('index.html')) | |
filters = { | |
title: 'td.title a', | |
comments: /(\d+) comments/, | |
points: /(\d+) points/, | |
} | |
# Reorder DOM trees by simhash | |
trees = [] | |
document.search('//div','//tr').each do |tree| | |
structure = Nokogiri::XML.remove_markup_outline(tree.inner_html).strip | |
if structure.empty? | |
tree.remove # This tree is useless | |
next | |
end | |
trees << { | |
simhash: structure.simhash, | |
node: tree, | |
index: tree.xpath('count(preceding-sibling::*)').to_i | |
} | |
end | |
trees.sort_by! { |tree| tree[:simhash] } | |
puts "#{trees.count} DOM trees collected and sorted" | |
# Group trees by high similarity | |
groups = [] | |
group = [] | |
last_tree = trees[0] | |
trees.each do |tree| | |
hamming_distance = (tree[:simhash] ^ last_tree[:simhash]).to_s(2).count('1') | |
similarity = 1 - (hamming_distance / 64.0) | |
if similarity < 0.6 | |
groups << {trees: group} | |
group = [] | |
end | |
group << tree | |
last_tree = tree | |
end | |
groups << {trees: group} | |
groups.reject! { |c| c.empty? } | |
puts "#{groups.count} groups" | |
# Score the groups with the filters | |
groups.each do |group| | |
group[:score] = 0 | |
group[:trees].each do |tree| | |
filters.each do |key, filter| | |
if filter.class == String | |
value = tree[:node].at(filter) | |
if value | |
tree[:item] ||= {} | |
tree[:item][key] = value.text | |
end | |
end | |
if filter.class == Regexp | |
match = filter.match(tree[:node].inner_html) | |
if match && match.captures | |
tree[:item] ||= {} | |
tree[:item][key] = match.captures[0] | |
end | |
end | |
end | |
group[:score] += tree[:item].to_h.count | |
end | |
group[:items] = group[:trees].map{|tree| tree[:item] } | |
.reject{|c| c.nil?}.count | |
group[:indices] = group[:trees].map{|tree| tree[:index]}.sort | |
end | |
groups.sort_by! { |group| group[:score] } | |
good_groups = groups.select { |group| | |
(group[:score]/group[:items].to_f >= 1.0) && | |
(group[:items] > 1) | |
} | |
puts "\nGroups stats" | |
puts groups.map{|x| | |
{ | |
trees: x[:trees].count, | |
score: x[:score], | |
items: x[:items], | |
indices: x[:indices] | |
} | |
} | |
puts "Found #{good_groups.count} good groups" | |
best_group = groups[-1] | |
# If multiple good groups are found, it's most likely because a single data | |
# item does NOT have a single parent HTML element. This means a single | |
# result is spread out over two or more DOM trees. | |
# Test if the groups interleave to join the trees back together as if they had | |
# a single parent element. | |
# TODO: This method is crazy | |
# it currently only handles interleaving indices that differ by only 1 | |
if good_groups.count == 2 | |
# Alternative: a.zip(b).flatten == a.zip(b).flatten.sort | |
interleaved = good_groups[0][:indices].zip(good_groups[1][:indices]) | |
.each_cons(2) | |
.all?{|i,j| i.first <= i.last && i.last <= j.first && j.first <= j.last} | |
if interleaved | |
puts 'WARNING: Interleaving.. a risky business' | |
# Start interleaving with the group that has the lowest index | |
gg = good_groups.sort_by{|group| group[:indices][0]} | |
gg[0][:trees].each do |tree| | |
# Find accompanying tree | |
i = tree[:index] | |
tree[:item].merge!( | |
gg[1][:trees].select{|other_tree| other_tree[:index] == i+1 }[0][:item] | |
) | |
end | |
best_group = gg[0] | |
end | |
end | |
puts "\nBest group items" | |
best_group[:trees].each do |tree| | |
puts "index: #{tree[:index]}" | |
tree[:item].each do |key, value| | |
puts "#{key}:\t#{value}" | |
end | |
puts '' | |
end | |
puts "Found #{best_group[:trees].count} items" | |
puts "Time elapsed #{Time.now - beginning} seconds" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment