# Service manual prototype hierarchy crawler
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'securerandom'
require 'time'
require 'yaml'
# Crawls the service manual prototype site and records its topic/guide
# hierarchy as content-item style hashes.
#
# #go writes two YAML files into the current working directory:
# section_hierarchy.yml (one entry per ".topic-block" on the start page) and
# guides.yml (one entry per guide page that could be fetched along the way).
class Crawl
  # @param start_url [String] root URL of the site to crawl; section and
  #   guide hrefs are joined onto it with File.join.
  def initialize(start_url = "http://sm-11.herokuapp.com")
    @start_url = start_url
    @guides = []
  end

  # Runs the crawl and writes both YAML files.
  #
  # `sections` is evaluated first on purpose: crawling the sections is what
  # fills @guides, so guides.yml must be written afterwards.
  def go
    File.write('section_hierarchy.yml', sections.to_yaml)
    File.write('guides.yml', @guides.to_yaml)
  end

  # Builds one topic hash per ".topic-block" element on the start page.
  # Fetching each topic page also fetches every guide it links to.
  #
  # @return [Array<Hash>] topic hashes; blocks whose heading has no link are
  #   skipped because there is no page to crawl for them.
  def sections
    doc = fetch_document(@start_url)
    doc.css(".topic-block").map do |section_block|
      heading_link = section_block.at_css(".heading-small a")
      href = heading_link && heading_link['href']
      next unless href

      base_path = service_manual_path(href)
      groups = link_groups(href)
      {
        content_id: SecureRandom.uuid,
        title: section_block.css(".heading-small").text.strip,
        base_path: base_path,
        description: section_block.css("p").text.strip,
        format: "service_manual_topic",
        publishing_app: "service-manual-publisher",
        rendering_app: "government-frontend",
        need_ids: [],
        locale: "en",
        update_type: 'minor',
        public_updated_at: Time.now.iso8601,
        details: {
          link_groups: groups
        },
        links: {
          # Every guide id of this topic, in group order. Derived from the
          # groups themselves rather than from state mutated while crawling.
          linked_items: groups.flat_map { |group| group[:linked_items] }
        },
        routes: [
          { type: "exact", path: base_path }
        ],
      }
    end.compact
  end

  # Fetches a topic page and returns one group per
  # ".collapsible-subsections ul" list on it.
  #
  # The n-th list is paired with the n-th ".collapsible-subsections h2" and
  # the n-th ".topic-description"; a missing counterpart yields an empty string.
  #
  # @param section_path [String] href of the topic page, joined onto the start URL
  # @return [Array<Hash>] hashes with :title, :description and :linked_items
  def link_groups(section_path)
    doc = fetch_document(File.join(@start_url, section_path))
    descriptions = doc.css(".topic-description")
    headings = doc.css(".collapsible-subsections h2")

    doc.css(".collapsible-subsections ul").map.with_index do |block, idx|
      description = node_text(descriptions[idx])
      {
        # both title and description live under h2, so strip the description out
        title: node_text(headings[idx]).gsub(description, ''),
        description: description,
        linked_items: linked_items(block)
      }
    end
  end

  # Crawls every anchor inside +block+ and collects the content ids of the
  # guides that could be fetched.
  #
  # @param block [#css] element whose "a" descendants are guide links
  # @return [Array<String>] content ids; links whose guide was not created are omitted
  def linked_items(block)
    block.css("a").map do |link|
      guide = add_guide(link)
      guide[:content_id] if guide
    end.compact
  end

  # Prefixes a site-relative path with "/service-manual", dropping a single
  # trailing slash from +path+.
  #
  # @param path [String]
  # @return [String]
  def service_manual_path(path)
    File.join("/service-manual", path.chomp("/"))
  end

  # Fetches the page behind +link+, builds a guide hash from it and appends
  # it to @guides. Prints the URL being fetched to stdout.
  #
  # @param link [#attr, #text] anchor element pointing at a guide page
  # @return [Hash, nil] the guide hash, or nil when the link has no href or
  #   the request fails with an HTTP error (the error message is printed)
  def add_guide(link)
    href = link.attr('href')
    return unless href

    page_url = File.join(@start_url, href)
    puts page_url
    doc = fetch_document(page_url)
    # One clock reading so both timestamps of a guide always agree.
    now = Time.now
    guide = {
      content_id: SecureRandom.uuid,
      title: link.text,
      description: "-",
      format: "service_manual_guide",
      publishing_app: "service-manual-publisher",
      rendering_app: "government-frontend",
      need_ids: [],
      locale: "en",
      updated_at: now.iso8601,
      public_updated_at: now.iso8601,
      update_type: "minor",
      phase: "beta",
      base_path: service_manual_path(href),
      routes: [
        { type: "exact", path: service_manual_path(href) }
      ],
      details: {
        body: doc.css(".markdown").to_s,
        header_links: [],
        publisher: {
          name: "Agile Community",
          href: "http://sm-11.herokuapp.com/agile-delivery/agile-and-government-services"
        }
      }
    }
    @guides << guide
    guide
  rescue OpenURI::HTTPError => e
    puts e.message
    nil
  end

  private

  # Downloads +url+ and parses it as HTML. URI.open is used because the
  # Kernel#open patch for URLs was removed from open-uri in Ruby 3.0; the
  # block form closes the underlying IO once parsing is done.
  def fetch_document(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end

  # Stripped text of +node+, or an empty string when the node is missing.
  def node_text(node)
    node ? node.text.strip : ''
  end
end
# Run the crawl against the default start URL when the script is executed.
Crawl.new.go
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment