Skip to content

Instantly share code, notes, and snippets.

@nkzsdy
Created February 16, 2019 05:32
Show Gist options
  • Save nkzsdy/ec274cec334478efa66592bddf21e8e2 to your computer and use it in GitHub Desktop.
Save nkzsdy/ec274cec334478efa66592bddf21e8e2 to your computer and use it in GitHub Desktop.
Scraping from MDN HTTP response status codes
require 'open-uri'
require 'nokogiri'
# Scrape the Japanese MDN "HTTP response status codes" page and collect one
# hash per <dt> found under the matched <dl> lists. Each hash carries:
#   :title       - text of the <dt> (or of its <a> elements when it has any)
#   :status_code - first whitespace-separated token of the title
#   :detail_url  - absolute URL built from the link's href, or nil without a link
#   :description - inner text of the <dd> at the same position in the same <dl>
base_url = 'https://developer.mozilla.org'
# URI.open is open-uri's entry point; Kernel#open stopped accepting URLs in Ruby 3.0.
doc = Nokogiri::HTML(URI.open("#{base_url}/ja/docs/Web/HTTP/Status"))
target_nodes = doc.xpath('//*[@id="wikiArticle"]/dl')
all_results = []
target_nodes.each do |node|
  # Build the entries for this <dl> separately so that <dd> positions are
  # matched against the <dt>s of the same list, not against the global array.
  node_results = node.children.css('dt').map do |item|
    links = item.css('a')
    if links.empty?
      title = item.text
      detail_url = nil
    else
      title = links.text
      detail_url = "#{base_url}#{links.attribute('href').value}"
    end
    { title: title, status_code: title.split(' ').first, detail_url: detail_url }
  end

  node.children.css('dd').each_with_index do |item, i|
    entry = node_results[i]
    # A <dd> without a matching <dt> has nothing to attach to; skip it.
    next unless entry

    entry[:description] = item.children.inner_text
  end

  all_results.concat(node_results)
end
# Render every scraped result as one <d:entry> element and concatenate them
# into `generated_xml`. The "詳細" link line is emitted only for results that
# have a :detail_url.
#
# Scraped values are escaped before being embedded: unescaped `&`, `<` or `"`
# would make the generated document malformed XML.
# String#encode(xml: :text) escapes &, < and >; String#encode(xml: :attr) also
# escapes " and returns the value already wrapped in double quotes.
xml_text = ->(value) { value.to_s.encode(xml: :text) }
xml_attr = ->(value) { value.to_s.encode(xml: :attr) }

generated_xml = all_results.map do |result|
  code_attr = xml_attr.call(result[:status_code])
  lines = [
    "<d:entry id=#{code_attr} d:title=#{code_attr}>",
    "<d:index d:value=#{code_attr} d:title=#{code_attr} d:yomi=#{code_attr} />",
    '<h1>',
    "<span class=\"headword\">#{xml_text.call(result[:status_code])}</span>",
    "<span class=\"hyouki\">【#{xml_text.call(result[:title])}】</span>",
    '</h1>',
    "<span class=\"meaning\">#{xml_text.call(result[:description])}</span>"
  ]
  lines << "<a href=#{xml_attr.call(result[:detail_url])}>詳細</a>" if result[:detail_url]
  lines << '</d:entry>'
  # Each entry ends with a newline, matching the heredoc-based layout.
  "#{lines.join("\n")}\n"
end.join
# Wrap the generated entries in the <d:dictionary> root element and write the
# whole document to HttpStatusCodes.xml, replacing any existing file.
# The squiggly heredoc strips the source indentation, so the written lines
# start at column 0.
File.write('HttpStatusCodes.xml', <<~XML)
  <?xml version="1.0" encoding="UTF-8"?>
  <d:dictionary xmlns="http://www.w3.org/1999/xhtml" xmlns:d="http://www.apple.com/DTDs/DictionaryService-1.0.rng">
  #{generated_xml}
  </d:dictionary>
XML
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment