Created
February 16, 2019 05:32
-
-
Save nkzsdy/ec274cec334478efa66592bddf21e8e2 to your computer and use it in GitHub Desktop.
Scraping HTTP response status codes from MDN
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
# Scrape the MDN (/ja/) "HTTP Status" page and collect one hash per status
# code into +all_results+. Each hash carries:
#   :title       - text of the <dt>, or of its link(s) when it contains any
#   :status_code - first whitespace-separated token of the title
#   :detail_url  - absolute URL built from the first link's href, or nil
#   :description - text of the <dd> at the same position in the same <dl>
base_url = 'https://developer.mozilla.org'
# URI.open is used because open-uri's Kernel#open support for URLs was
# deprecated in Ruby 2.7 and removed in Ruby 3.0.
doc = Nokogiri::HTML(URI.open("#{base_url}/ja/docs/Web/HTTP/Status"))
target_nodes = doc.xpath('//*[@id="wikiArticle"]/dl')
all_results = []
target_nodes.each do |node|
  # all_results accumulates across every <dl>, while <dd> positions restart at
  # zero inside each <dl>; remember where this list's entries begin so each
  # description lands on the entry from the same list.
  first_index = all_results.size

  node.children.css('dt').each do |item|
    links = item.css('a')
    if links.empty?
      title = item.text
      detail_url = nil
    else
      title = links.text
      href = links.first['href']
      # A link without an href yields no detail URL instead of raising.
      detail_url = href && "#{base_url}#{href}"
    end
    all_results << { title: title, status_code: title.split.first, detail_url: detail_url }
  end

  node.children.css('dd').each_with_index do |item, i|
    entry = all_results[first_index + i]
    next unless entry # more <dd> than <dt> in this list: nothing to attach to

    entry[:description] = item.children.inner_text
  end
end
# Render every scraped entry in +all_results+ as a <d:entry> element and
# concatenate them into +generated_xml+.
#
# All scraped values are XML-escaped before interpolation: the text comes from
# a parsed HTML page, so a literal "&", "<" or '"' would otherwise produce a
# malformed document. String#encode(xml: :attr) returns the value already
# wrapped in double quotes, so attribute interpolations carry no quotes of
# their own.
generated_xml = ""
all_results.each do |result|
  code = result[:status_code].to_s
  code_attr = code.encode(xml: :attr)
  code_text = code.encode(xml: :text)
  title_text = result[:title].to_s.encode(xml: :text)
  # :description may be missing when a <dt> had no matching <dd>; to_s keeps
  # the original behavior of rendering it as an empty string.
  description_text = result[:description].to_s.encode(xml: :text)

  partial = <<-EOS
<d:entry id=#{code_attr} d:title=#{code_attr}>
<d:index d:value=#{code_attr} d:title=#{code_attr} d:yomi=#{code_attr} />
<h1>
<span class="headword">#{code_text}</span>
<span class="hyouki">【#{title_text}】</span>
</h1>
<span class="meaning">#{description_text}</span>
  EOS
  # Only entries that link to a detail page get the trailing anchor.
  if result[:detail_url]
    partial << "<a href=#{result[:detail_url].to_s.encode(xml: :attr)}>詳細</a>\n"
  end
  partial << "</d:entry>\n"

  generated_xml << partial
end
# Wrap the generated entries in the <d:dictionary> root element and write the
# complete document to HttpStatusCodes.xml in the current working directory,
# replacing any existing file. The heredoc always ends with a newline, so the
# bytes written match what puts would emit.
File.write('HttpStatusCodes.xml', <<-EOS)
<?xml version="1.0" encoding="UTF-8"?>
<d:dictionary xmlns="http://www.w3.org/1999/xhtml" xmlns:d="http://www.apple.com/DTDs/DictionaryService-1.0.rng">
#{generated_xml}
</d:dictionary>
EOS
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment