zbage/gist:198173

## gistfile1.rb
# Fetches the page at +url+ and reduces its <body> to a lowercase string of
# plain words separated by single spaces.
#
# The body markup is extracted with Nokogiri, re-parsed with REXML so that
# text nodes can be collected (text sitting directly inside <script> is
# skipped), and the joined text is run through a series of regexp clean-up
# passes.
#
# The method references URI.open (open-uri, Ruby 2.5+), Nokogiri and REXML;
# none of them is required here, so the surrounding file must load them.
#
# @param url [String] address of the page; anything URI.open accepts
# @return [String] the cleaned, downcased text
def plain_text(url)
  # Kernel#open no longer fetches URLs on Ruby 3.0+, so go through URI.open;
  # the block form closes the handle as soon as Nokogiri has read it.
  doc = URI.open(url) { |io| Nokogiri::HTML(io) }

  # Serialise <body>, flatten every whitespace run (newlines included) and
  # drop the whitespace between adjacent tags.
  markup = doc.xpath('//body').to_xml.to_s
  markup = markup.gsub(/\s+/, ' ').gsub(/>\s+</, '><')

  # Collect the text children of every element except <script>. The `*` node
  # test is required: an XPath step made of a bare predicate is not valid.
  tree = REXML::Document.new(markup)
  text = REXML::XPath.match(tree, "//*[name() != 'script']/text()").join(' ')

  # Drop numeric character references such as "&#160;".
  text = text.gsub(/&#\w+;/, '')
  # Drop word runs glued together by one of / . & # ; @
  text = text.gsub(/(\w+[\w&)\.\\][\/\.&#;@][\w;\.\=:]\w+)/, '')
  # Drop tokens that start with a digit.
  text = text.gsub(/(\b\d[\d\w]*\b)/, '')
  # Turn intra-word hyphens into spaces. Lookarounds keep the neighbouring
  # letters out of the match, so every hyphen of "state-of-the-art" is split.
  text = text.gsub(/(?<=\w)-(?=\w)/, ' ')
  # Drop the remaining punctuation characters.
  text = text.gsub(/([-\=:()#'\/\?\|"\\,\.;&\!\[\]])/, '')
  # Drop runs of 15 or more word characters.
  text = text.gsub(/\w{15,}/, '')
  # Drop stand-alone single word characters.
  text = text.gsub(/\b\w\b/, '')
  # Collapse whitespace runs to one space, then lowercase.
  text.gsub(/\s+/, ' ').downcase
end
	# Downloads +url+ and boils the page's <body> down to lowercase words
	# separated by single spaces.
	#
	# Nokogiri extracts the body markup, REXML re-parses it so the text nodes
	# can be gathered (text directly inside <script> is left out), and a chain
	# of regexp substitutions scrubs the joined text.
	#
	# URI.open (open-uri, Ruby 2.5+), Nokogiri and REXML are referenced but not
	# required here; the surrounding file has to load them.
	#
	# @param url [String] address of the page; anything URI.open accepts
	# @return [String] the scrubbed, downcased text
	def plain_text(url)
	  # URI.open replaces Kernel#open for URLs (Ruby 3.0+ needs it); the block
	  # form closes the handle once Nokogiri has consumed it.
	  page = URI.open(url) { |io| Nokogiri::HTML(io) }

	  # Flatten all whitespace runs, then remove whitespace between tags.
	  flat = page.xpath('//body').to_xml.to_s
	             .gsub(/\s+/, ' ')
	             .gsub(/>\s+</, '><')

	  # Each XPath step needs a node test, hence `*` in front of the predicate.
	  tree = REXML::Document.new(flat)
	  REXML::XPath.match(tree, "//*[name() != 'script']/text()")
	    .join(' ')
	    .gsub(/&#\w+;/, '')                                   # numeric character references
	    .gsub(/(\w+[\w&)\.\\][\/\.&#;@][\w;\.\=:]\w+)/, '')   # word runs joined by / . & # ; @
	    .gsub(/(\b\d[\d\w]*\b)/, '')                          # tokens starting with a digit
	    .gsub(/(?<=\w)-(?=\w)/, ' ')                          # every intra-word hyphen -> space
	    .gsub(/([-\=:()#'\/\?\|"\\,\.;&\!\[\]])/, '')         # leftover punctuation
	    .gsub(/\w{15,}/, '')                                  # runs of 15+ word characters
	    .gsub(/\b\w\b/, '')                                   # stand-alone single characters
	    .gsub(/\s+/, ' ')                                     # collapse whitespace
	    .downcase
	end