NilsHaldenwang/plain_text_extractor.rb

## plain_text_extractor.rb
require "rubygems"
require "nokogiri"

# SAX handler that collects the character data found between a
# <!-- someComment --> marker and the matching <!-- /someComment --> marker.
#
# The parser calls #comment and #characters while it walks the document;
# the text gathered so far is exposed through #plaintext.
class PlainTextExtractor < Nokogiri::XML::SAX::Document
  # Patterns tested against the stripped comment text. They are anchored
  # with \A (start of string) rather than ^ (start of any line) so that a
  # multi-line comment which merely contains a marker on a later line does
  # not toggle collection.
  OPENING_MARKER = /\AsomeComment/
  CLOSING_MARKER = %r{\A/someComment}

  attr_reader :plaintext

  # Start outside the region of interest with nothing collected.
  def initialize
    @interesting = false
    @plaintext = ""
  end

  # Callback invoked for every comment; +string+ is the comment's text.
  # An opening marker switches collection on, a closing marker switches it
  # off, and any other comment leaves the current state untouched.
  def comment(string)
    case string.strip # ignore the whitespace padding inside <!-- ... -->
    when OPENING_MARKER
      @interesting = true
    when CLOSING_MARKER
      @interesting = false
    end
  end

  # Callback invoked with the character data found between tags.
  # The text is appended to #plaintext only while collection is switched on.
  def characters(string)
    @plaintext << string if @interesting
  end
end

# Command-line entry point: parse the HTML file named by the first argument
# and print the text collected between the marker comments.
input_path = ARGV[0]
# Fail with a readable usage line instead of handing nil to the parser.
abort "Usage: #{$PROGRAM_NAME} FILE" unless input_path

pte = PlainTextExtractor.new
parser = Nokogiri::HTML::SAX::Parser.new(pte)
parser.parse_file(input_path)
puts pte.plaintext

## sample_page.html
<html>
  <head>
    <title>Some Title</title>
  </head>
  <body>
     <h2>Here goes some heading we are not interested in.</h2>

      <!-- someComment -->
        Here it goes. We are interested in this text. <br>
        But <b>some</b> words are wrapped with HTML-Tags we are <i>not</i>
        interested in.
        <a href="bar">Or links,..</a>
        <table>
          <tr>
            <td>Or a Table,...</td>
          </tr>
        </table>
      <!-- /someComment -->

      But we do NOT care about this.

      <!-- foo -->
        Even if it is wrapped in another comment.
      <!-- /foo -->
  </body>
</html>
	require "rubygems"
	require "nokogiri"

	# SAX handler that collects the character data found between a
	# <!-- someComment --> marker and the matching <!-- /someComment --> marker.
	#
	# The parser calls #comment and #characters while it walks the document;
	# the text gathered so far is exposed through #plaintext.
	class PlainTextExtractor < Nokogiri::XML::SAX::Document
	  # Patterns tested against the stripped comment text. They are anchored
	  # with \A (start of string) rather than ^ (start of any line) so that a
	  # multi-line comment which merely contains a marker on a later line does
	  # not toggle collection.
	  OPENING_MARKER = /\AsomeComment/
	  CLOSING_MARKER = %r{\A/someComment}

	  attr_reader :plaintext

	  # Start outside the region of interest with nothing collected.
	  def initialize
	    @interesting = false
	    @plaintext = ""
	  end

	  # Callback invoked for every comment; +string+ is the comment's text.
	  # An opening marker switches collection on, a closing marker switches it
	  # off, and any other comment leaves the current state untouched.
	  def comment(string)
	    case string.strip # ignore the whitespace padding inside <!-- ... -->
	    when OPENING_MARKER
	      @interesting = true
	    when CLOSING_MARKER
	      @interesting = false
	    end
	  end

	  # Callback invoked with the character data found between tags.
	  # The text is appended to #plaintext only while collection is switched on.
	  def characters(string)
	    @plaintext << string if @interesting
	  end
	end

	# Command-line entry point: parse the HTML file named by the first argument
	# and print the text collected between the marker comments.
	input_path = ARGV[0]
	# Fail with a readable usage line instead of handing nil to the parser.
	abort "Usage: #{$PROGRAM_NAME} FILE" unless input_path

	pte = PlainTextExtractor.new
	parser = Nokogiri::HTML::SAX::Parser.new(pte)
	parser.parse_file(input_path)
	puts pte.plaintext
	<html>
	<head>
	<title>Some Title</title>
	</head>
	<body>
	<h2>Here goes some heading we are not interested in.</h2>

	<!-- someComment -->
	Here it goes. We are interested in this text. <br>
	But <b>some</b> words are wrapped with HTML-Tags we are <i>not</i>
	interested in.
	<a href="bar">Or links,..</a>
	<table>
	<tr>
	<td>Or a Table,...</td>
	</tr>
	</table>
	<!-- /someComment -->

	But we do NOT care about this.

	<!-- foo -->
	Even if it is wrapped in another comment.
	<!-- /foo -->
	</body>
	</html>