savetheclocktower/scrape_mdn.rb

## scrape_mdn.rb
#!/usr/bin/env ruby

# Scrape the JavaScript documentation on the Mozilla Developer Network (MDN)
# and store an index as a YAML file.
#
# The data is organized by "token." If someone hits ^H in the middle of the
# word "toUpperCase", we want to link them to the MDN page for
# `String.prototype.toUpperCase`. If they're in the middle of the word
# "indexOf", we want to let them choose between `Array.prototype.indexOf` and
# `String.prototype.indexOf`.
#
# So the YAML dump is a giant hash with the token as the key and arrays as
# values containing all possible options for that token. (YAML is the storage
# format rather than JSON because Ruby 1.8 doesn't have built-in JSON.)
#
# The point is not to generate an _exhaustive_ index, because there are a lot
# of obscure APIs out there. It's to cover the stuff that gets used most
# often.
#
# END USERS WILL NOT HAVE TO RUN THIS SCRIPT. This is the script that
# generates the YAML file, which should then be checked into source control.
# It ought to be regenerated if there's a major new addition to the language
# (or to the Web APIs) that's worth including.

require 'nokogiri'
require 'yaml'
require 'open-uri'

# The token index. Reading a token that has not been seen yet stores (and
# returns) a fresh empty array, so entries can be appended with `<<` without
# checking for the key first.
$result = Hash.new do |index, token|
  index[token] = []
end

# Keywords whose MDN page is titled differently from the keyword itself,
# mapped to that page title.
ALIASES = {
  'async' => 'async function',
  'catch' => 'try...catch',
  'do' => 'do...while',
  'else' => 'if...else',
  'if' => 'if...else',
  'in' => 'for...in',
  'of' => 'for...of',
  'try' => 'try...catch'
}

# Returns the page title registered for `token` in ALIASES, or `token`
# itself when no alias is registered.
def aliased(token)
  ALIASES.fetch(token) { token }
end

# Builds the absolute URL for `path`, where `path` is relative to the root
# of MDN's en-US "Web" documentation (for example "/API/window").
def url(path)
  format('https://developer.mozilla.org/en-US/docs/Web%s', path)
end

# Global functions, each linked to
# /JavaScript/Reference/Global_Objects/$term
GLOBAL_FUNCTIONS = %w[
  decodeURI
  decodeURIComponent
  encodeURI
  encodeURIComponent
  eval
  isFinite
  isNaN
  parseFloat
  parseInt
]

# Global values, also linked to
# /JavaScript/Reference/Global_Objects/$term
GLOBAL_OBJECTS = %w[undefined null NaN Infinity]

# Index every global under its own name; the entry's name and URL use the
# aliased page title.
[GLOBAL_FUNCTIONS, GLOBAL_OBJECTS].flatten.each do |token|
  page = aliased(token)
  entry = { :name => page, :url => "/JavaScript/Reference/Global_Objects/#{page}" }
  $result[token].push(entry)
end

# Operator keywords, each linked to
# /JavaScript/Reference/Operators/$term
#
# NOTE(review): MDN appears to document `get`/`set` under
# /JavaScript/Reference/Functions/ and `let` only as a statement -- confirm
# that these three Operators URLs resolve before regenerating the index.
OPERATORS = [
  'await',
  'delete',
  'get',
  'in',
  'instanceof',
  'let',
  'new',
  'set',
  'this',
  'typeof',
  'void',
  'yield*'
]

# Operators are indexed and linked under their own name. They are
# deliberately NOT passed through aliased(): ALIASES maps 'in' to
# 'for...in', which would send the `in` operator to /Operators/for...in.
# No other operator in the list has an alias, so nothing else changes.
OPERATORS.each do |token|
  $result[token] << {
    :name => token,
    :url => "/JavaScript/Reference/Operators/#{token}"
  }
end

# Statement keywords, each linked to
# /JavaScript/Reference/Statements/$title, where $title is the aliased page
# title noted in the trailing comment (or the keyword itself).
#
# Every keyword appears exactly once; a repeated keyword would produce a
# repeated entry in the index.
STATEMENTS = [
  'async',   # async function
  'break',
  'catch',   # try...catch
  'class',
  'const',
  'continue',
  'debugger',
  'do',      # do...while
  'else',    # if...else
  'export',
  'for',
  'function',
  'function*',
  'if',      # if...else
  'import',
  'in',      # for...in
  'label',
  'let',
  'of',      # for...of
  'return',
  'switch',
  'throw',
  'try',     # try...catch
  'var',
  'while',
  'with',
  'yield'
]

STATEMENTS.each do |token|
  title = aliased(token)

  # Index under the keyword the user actually typed (e.g. "if") and show
  # the page title ("if...else") as the name, matching how the globals and
  # operators are indexed. Keying by title would make a lookup for "if"
  # come back empty.
  $result[token] << {
    :name => title,
    # MDN slugs use underscores where the title has spaces
    # ("async function" -> "async_function").
    :url => "/JavaScript/Reference/Statements/#{title.tr(' ', '_')}"
  }
end

# Names of the built-in global objects covered by the index.
BUILTINS = %w[
  Array
  Date
  Function
  Error
  Boolean
  JSON
  Map
  Number
  Object
  Promise
  Proxy
  RangeError
  ReferenceError
  RegExp
  Set
  String
  Symbol
  SyntaxError
  TypeError
  WeakMap
  WeakSet
]

# One-off tokens that are worth indexing, each mapped to the path of the
# page it should link to.
MISC = { :Image => '/API/HTMLImageElement/Image' }

# Index each one-off token under its own name, displaying the aliased name.
MISC.each_pair do |sym, path|
  token = sym.to_s
  entry = { :name => aliased(token), :url => path }
  $result[token].push(entry)
end

# Pages to screen-scrape for links to methods and properties. Each key names
# the object a page documents; each value is that page's path.
TOC = {
  # Array is the odd one out: its instance methods are not listed on its
  # base page, so its prototype page is scraped as well.
  :"Array.prototype" => '/JavaScript/Reference/Global_Objects/Array/prototype',

  window: '/API/window',
  document: '/API/document',
  console: '/API/console',

  # Scraping _every_ DOM class yields far too many duplicates, so start
  # with this subset.
  Node: '/API/Node',
  Event: '/API/Event',
  HTMLElement: '/API/HTMLElement',
  HTMLCollection: '/API/HTMLCollection'
}

# Queue the base page of every builtin for scraping too.
BUILTINS.each { |name| TOC[name.to_sym] = "/JavaScript/Reference/Global_Objects/#{name}" }

# Crawl every page listed in TOC. Each page is indexed under its own key,
# then every method/property link found in the page's definition lists is
# indexed under the last dotted segment of its name. Progress goes to
# STDERR; the finished index is written to STDOUT as YAML.
TOC.each do |key, value|
  $result[key.to_s] << { :name => key.to_s, :url => value }

  page_url = url(value)
  STDERR.puts "Crawling #{key}: #{page_url}"
  begin
    # Use URI#open (added to HTTP(S) URIs by open-uri) rather than
    # Kernel#open, which no longer accepts URLs on Ruby 3.0+. The block
    # form closes the response stream once the document is parsed.
    doc = URI.parse(page_url).open { |io| Nokogiri::HTML(io) }
  rescue OpenURI::HTTPError => e
    # Report the actual HTTP status; not every failure is a 404.
    STDERR.puts "#{e.message}: #{page_url}"
    exit 1
  end

  count = 0

  # Thank god for semantic markup. All items are in definition lists.
  doc.css('dt').each do |dt|
    # Skip the ones that don't have wiki pages.
    next if dt.at_css('a.new')

    # Skip non-standard stuff. The title text will say something like
    # "...this is not standardized."
    next if dt.at_css('span[title*="standardized"]')

    # All methods and properties are inside CODE tags, so anything that
    # doesn't have one is a false positive.
    code = dt.at_css('a > code')
    next unless code

    name = code.text
    # Named `href` so it doesn't shadow the url() helper.
    href = dt.at_css('a')['href']

    # "String.prototype.indexOf()" -> "indexOf". `to_s` covers an empty
    # name, for which split returns no segments.
    token = name.split('.').last.to_s.gsub(/\(.*?\)/, '')

    # An anchor without an href, or a blank name, cannot be indexed.
    next if href.nil? || token.empty?

    $result[token] << {
      name: name,
      url: href.gsub('/en-US/docs/Web', '')
    }
    count += 1
  end
  STDERR.puts " found #{count}"
end

puts YAML::dump($result)
	#!/usr/bin/env ruby

	# Scrape the JavaScript documentation on Mozilla Developer and store an index
	# as a YAML file.
	#
	# The data is organized by "token." If someone hits ^H in the middle of the
	# word "toUpperCase", we want to link them to the MDN page for
	# `String.prototype.toUpperCase`. If they're in the middle of the word
	# "indexOf", we want to let them choose between `Array.prototype.indexOf` and
	# `String.prototype.indexOf`.
	#
	# So the YAML dump is a giant hash with the token as the key and arrays as
	# values containing all possible options for that token. (YAML is the storage
	# format rather than JSON because Ruby 1.8 doesn't have built-in JSON.)
	#
	# The point is not to generate an _exhaustive_ index, because there are a lot
	# of obscure APIs out there. It's to cover the stuff that gets used most
	# often.
	#
	# END USERS WILL NOT HAVE TO RUN THIS SCRIPT. This is the script that
	# generates the YAML file, which should then be checked into source control.
	# It ought to be regenerated if there's a major new addition to the language
	# (or to the Web APIs) that's worth including.

	require 'nokogiri'
	require 'yaml'
	require 'open-uri'

	# Nonexistent entries in the hash get initialized as empty arrays.
	$result = Hash.new { |hash, key| hash[key] = [] }

	ALIASES = {
	'async' => 'async function',
	'catch' => 'try...catch',
	'do' => 'do...while',
	'else' => 'if...else',
	'if' => 'if...else',
	'in' => 'for...in',
	'of' => 'for...of',
	'try' => 'try...catch'
	}

	def aliased(token)
	ALIASES[token] || token
	end

	def url(path)
	"https://developer.mozilla.org/en-US/docs/Web#{path}"
	end

	# /JavaScript/Reference/Global_Objects/$term
	GLOBAL_FUNCTIONS = [
	'decodeURI',
	'decodeURIComponent',
	'encodeURI',
	'encodeURIComponent',
	'eval',
	'isFinite',
	'isNaN',
	'parseFloat',
	'parseInt'
	]

	# /JavaScript/Reference/Global_Objects/$term
	GLOBAL_OBJECTS = [
	'undefined',
	'null',
	'NaN',
	'Infinity'
	]

	(GLOBAL_FUNCTIONS + GLOBAL_OBJECTS).each do |token|
	gf = aliased(token)
	$result[token] << {
	:name => gf,
	:url => "/JavaScript/Reference/Global_Objects/#{gf}"
	}
	end

	# /JavaScript/Reference/Operators/$term
	OPERATORS = [
	'await',
	'delete',
	'get',
	'in',
	'instanceof',
	'let',
	'new',
	'set',
	'this',
	'typeof',
	'void',
	'yield*'
	]

	OPERATORS.each do |token|
	op = aliased(token)
	$result[token] << {
	:name => op,
	:url => "/JavaScript/Reference/Operators/#{op}"
	}
	end

	STATEMENTS = [
	'async', # async function
	'break',
	'catch', # try...catch
	'class',
	'const',
	'const',
	'continue',
	'debugger',
	'do', # do...while
	'else', # if...else
	'export',
	'for',
	'function',
	'function*',
	'if', # if...else
	'import',
	'in', # for...in
	'label',
	'let',
	'let',
	'of', # for...of
	'return',
	'switch',
	'throw',
	'try', # try...catch
	'var',
	'while',
	'with',
	'yield'
	]

	STATEMENTS.each do |token|
	title = aliased(token)
	$result[title] << {
	:name => token,
	:url => "/JavaScript/Reference/Statements/#{title}"
	}
	end

	BUILTINS = [
	'Array',
	'Date',
	'Function',
	'Error',
	'Boolean',
	'JSON',
	'Map',
	'Number',
	'Object',
	'Promise',
	'Proxy',
	'RangeError',
	'ReferenceError',
	'RegExp',
	'Set',
	'String',
	'Symbol',
	'SyntaxError',
	'TypeError',
	'WeakMap',
	'WeakSet',
	]

	# Other tokens that should be included, along with the pages they should
	# point to.
	MISC = {
	:Image => '/API/HTMLImageElement/Image',
	}

	MISC.each do |token, v|
	k = aliased(token.to_s)
	$result[token.to_s] << { :name => k, :url => v }
	end

	# Everything in TOC will be screen-scraped for links to methods/properties.
	#
	# The keys are for reference, but aren't used as tokens.
	#
	TOC = {
	# The rest of the builtins have their instance methods listed on their base
	# pages, but Array doesn't, for whatever reason.
	:"Array.prototype" => '/JavaScript/Reference/Global_Objects/Array/prototype',

	:window => '/API/window',
	:document => '/API/document',
	:console => '/API/console',

	# If we scrape _all_ DOM classes, we get way too many dupes. This seems
	# like a good subset to start off with.
	:Node => '/API/Node',
	:Event => '/API/Event',
	:HTMLElement => '/API/HTMLElement',
	:HTMLCollection => '/API/HTMLCollection',
	}

	# Everything in BUILTINS should also get scraped.
	BUILTINS.each do |builtin|
	TOC[builtin.to_sym] = "/JavaScript/Reference/Global_Objects/#{builtin}"
	end

	TOC.each do |key, value|
	$result[key.to_s] << { :name => key.to_s, :url => value }

	STDERR.puts "Crawling #{key}: #{url(value)}"
	begin
	doc = Nokogiri::HTML( open( url(value) ) )
	rescue OpenURI::HTTPError => e
	STDERR.puts "404: #{url(value)}"
	exit 1
	end

	count = 0

	# Thank god for semantic markup. All items are in definition lists.
	doc.css('dt').each do |dt|
	# Skip the ones that don't have wiki pages.
	next if dt.at_css('a.new')

	# Skip non-standard stuff. The title text will say something like
	# "...this is not standardized."
	next if dt.at_css('span[title*="standardized"]')

	# All methods and properties are inside CODE tags, so anything that
	# doesn't have one is a false positive.
	next unless dt.at_css('a > code')

	name = dt.at_css('a > code').text
	url = dt.at_css('a')['href']

	token = name.split('.').last.gsub(/\(.*?\)/, '')

	$result[token] << {
	name: name,
	url: url.gsub('/en-US/docs/Web', '')
	}
	count += 1
	end
	STDERR.puts " found #{count}"
	end

	puts YAML::dump($result)