Skip to content

Instantly share code, notes, and snippets.

@mertonium
Last active August 29, 2015 14:07
Show Gist options
  • Save mertonium/dbd81be86a473a0bc5e7 to your computer and use it in GitHub Desktop.
Save mertonium/dbd81be86a473a0bc5e7 to your computer and use it in GitHub Desktop.
Nokogiri custom pseudo selector
require 'nokogiri'
# HTML fixture parsed by the examples below.
# Inside div.col1 only the three bare text fragments ("Start of the text we
# want.", "More of the text we want." and "Last chunk of text we want.") are
# the content the examples expect to extract; the headings, links, paragraph,
# nested div, script tag, comment and the whole of div.col2 are noise.
html_string = <<-HTML
<div id="columns">
<div class="col1">
<h4>October 01, 2014</h4><br />
<a name="063579" class="random-named-link"></a>
<h2>Some kind of heading</h2>
Start of the text we want.<br />
<br />
More of the text we want. <br />
<br />
Last chunk of text we want.
<p class="foo">Stuff we don't want.<br /></p>
<a href="http://example.com">Some link we don't want</a>
<div class="blah">
More stuff we don't want. And a <a href="">link</a> to.
</div>
<br />
<script src="http://somesocialtrackingwidget.com" type="text/javascript"></script>
<!-- Loads more divs we don't want -->
</div>
<div class="col2">
A whole column of stuff we don't want.
</div>
</div>
HTML
# Normalizes whitespace in +str+ so that text extracted from HTML can be
# compared against plain string literals.
#
# Every run of whitespace is collapsed into a single ASCII space and
# leading/trailing whitespace is removed. Unicode whitespace matched by
# [[:space:]] (for example the non-breaking space U+00A0) is treated the same
# as ASCII whitespace.
#
# @param str [String] the text to normalize
# @return [String] a new, normalized string; +str+ itself is not modified
def clean_whitespace(str)
  # Collapse first, strip second: String#strip only knows ASCII whitespace,
  # so stripping before normalization would leave a stray space wherever the
  # input started or ended with Unicode whitespace.
  str.gsub(/[[:space:]]+/, ' ').strip
end
# Compares two implementations of a custom ":children_text_nodes" CSS
# pseudo-class handler passed to Nokogiri's #css. Both examples expect the
# three wanted text fragments of div.col1, whitespace-normalized and joined
# by single spaces.
describe "#children_text_nodes" do
  let(:dom) { Nokogiri::HTML.parse(html_string) }

  let(:expected_result) do
    wanted_fragments = [
      'Start of the text we want.',
      'More of the text we want.',
      'Last chunk of text we want.'
    ]
    wanted_fragments.join(' ')
  end

  it "extracts the text nodes from the given node set (works)" do
    # This handler unlinks every descendant element of the matched nodes and
    # then hands back the very same node set it was given.
    # NOTE(review): this appears to work by mutating the parsed document, so
    # only non-element children remain under the matched node — confirm.
    handler = Class.new do
      def children_text_nodes(node_set)
        node_set.css('*').unlink
        node_set
      end
    end.new

    content_nodes = dom.css("#columns .col1:children_text_nodes", handler)
    cleaned = content_nodes.map { |node| clean_whitespace(node.content) }

    expect(cleaned.join(' ')).to eq(expected_result)
  end

  it "extracts the text nodes from the given node set (doesn't work)" do
    # This handler leaves the document untouched and returns an Array holding
    # only the children whose class is exactly Nokogiri::XML::Text.
    # NOTE(review): the author labels this variant as not working; presumably
    # the handler's return value only decides whether the matched node is
    # kept rather than replacing the result set — confirm against the
    # Nokogiri documentation for custom pseudo-classes.
    handler = Class.new do
      def children_text_nodes(node_set)
        node_set.children.select { |child| child.instance_of?(Nokogiri::XML::Text) }
      end
    end.new

    content_nodes = dom.css("#columns .col1:children_text_nodes", handler)
    cleaned = content_nodes.map { |node| clean_whitespace(node.content) }

    expect(cleaned.join(' ')).to eq(expected_result)
  end
end
# Gemfile: dependencies needed to run the spec.
source 'https://rubygems.org'
# Nokogiri is pinned to exactly 1.6.1.
gem "nokogiri", "1.6.1"
# Pessimistic constraint: any RSpec 2.14.x release.
gem "rspec", "~>2.14.0"
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.2.5)
mini_portile (0.5.3)
nokogiri (1.6.1)
mini_portile (~> 0.5.0)
rspec (2.14.1)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
rspec-mocks (~> 2.14.0)
rspec-core (2.14.8)
rspec-expectations (2.14.5)
diff-lcs (>= 1.1.3, < 2.0)
rspec-mocks (2.14.6)
PLATFORMS
ruby
DEPENDENCIES
nokogiri (= 1.6.1)
rspec (~> 2.14.0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment