@ZempTime
Last active August 11, 2016 15:41
Page Scraping Logic/Ideas
# from http://blog.rubyroidlabs.com/2016/04/web-scraping-2/
require 'sanitize'

# Normalize smart quotes down to plain ASCII quotes so downstream matching
# only has to deal with one kind of quote character.
def strip_bad_chars(text)
  text.gsub!(/"/, "'")
  text.gsub!(/\u2018/, "'")
  text.gsub!(/[”“]/, '"')
  text.gsub!(/’/, "'")
  text
end
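# A minimal usage sketch for strip_bad_chars; the sample string is
# hypothetical, and .dup guards against frozen string literals since the
# method mutates its argument with gsub!.
sample = "“Smart quotes” and ’apostrophes’ everywhere".dup
puts strip_bad_chars(sample)
# prints: "Smart quotes" and 'apostrophes' everywhere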
# Collapse newlines and runs of whitespace, strip the markup down to a
# whitelist of tags, attributes, and protocols, then trim the result.
def clean_body(text)
  text.gsub!(/(\r)?\n/, "")
  text.gsub!(/\s+/, ' ')

  text = Sanitize.clean(text,
    :elements => ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'b', 'strong', 'em', 'img', 'iframe'],
    :attributes => {
      'a'      => ['href', 'title', 'name'],
      'img'    => ['src', 'title', 'alt'],
      'iframe' => ['src', 'url', 'class', 'id', 'width', 'height', 'name'],
    },
    :protocols => {
      'a'      => { 'href' => ['http', 'https', 'mailto'] },
      'iframe' => { 'src'  => ['http', 'https'] }
    })

  # clean start and end whitespace
  text = text.strip
  return text
end
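# Usage sketch for the cleanup pipeline on an inline snippet. The HTML below
# is hypothetical (not from the original gist), and it assumes a pre-3.0
# version of the sanitize gem, which exposes Sanitize.clean. The wrapping
# <div> and the onclick attribute are not on the whitelist, so Sanitize
# drops them, while the h1/p/a markup and the href survive.
snippet = <<-HTML
  <div class="wrapper">
    <h1>“Quoted” Title</h1>
    <p>Body text with a <a href="https://example.com" onclick="evil()">link</a>.</p>
  </div>
HTML
puts clean_body(strip_bad_chars(snippet))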
# Principles:
# - clean the cruft: strip the nonessential parts of the page at the start
# - make specific parts of the page easily and clearly callable
# - there is data cleaning, data organization, then data extraction; these three phases of logic are clearer when kept distinct
# - the goal is to write logic you will still understand when you return to it in 6 months
# - unlike most other places in code, comments are probably necessary for wonky web pages
# - lean on CSS selectors where possible (see the sketch below)
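# A sketch of those three phases (cleaning, organization, extraction) built
# on CSS selectors. Nokogiri, the class name, the URL, and the selectors are
# all assumptions for illustration, not part of the original gist.
require 'nokogiri'
require 'open-uri'

class ArticleScraper
  def initialize(url)
    @url = url
  end

  def call
    extract(organize(clean(URI.open(@url).read)))
  end

  private

  # Phase 1: data cleaning -- normalize quotes and strip unwanted markup.
  def clean(html)
    clean_body(strip_bad_chars(html))
  end

  # Phase 2: data organization -- give the parts of the page we care about
  # clear, callable names via CSS selectors.
  def organize(html)
    doc = Nokogiri::HTML(html)
    {
      title:      doc.at_css('h1'),      # hypothetical selectors
      paragraphs: doc.css('p'),
      links:      doc.css('a[href]')
    }
  end

  # Phase 3: data extraction -- pull plain values out of the organized nodes.
  def extract(parts)
    {
      title: parts[:title]&.text&.strip,
      body:  parts[:paragraphs].map(&:text).join(' '),
      links: parts[:links].map { |a| a['href'] }
    }
  end
end

# ArticleScraper.new('https://example.com/article').call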