Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Service manual prototype hierarchy crawler
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'securerandom'
require 'time'
require 'yaml'
# Crawls the service manual prototype site and dumps its structure as YAML.
#
# Starting from the index page at the start URL, every ".topic-block" element
# becomes a "service_manual_topic" hash, and every page linked from a topic's
# collapsible subsections becomes a "service_manual_guide" hash. #go writes
# the topics to section_hierarchy.yml and the guides to guides.yml.
class Crawl
  # @param start_url [String] root URL of the prototype to crawl
  def initialize(start_url = "http://sm-11.herokuapp.com")
    @start_url = start_url
    @guides = []
  end

  # Runs the crawl and writes both YAML files into the current directory.
  #
  # The sections are built before either file is opened, so a failed crawl
  # does not leave a truncated section_hierarchy.yml behind. Building the
  # sections is also what fills @guides, hence the order of the two writes.
  def go
    File.write('section_hierarchy.yml', sections.to_yaml)
    File.write('guides.yml', @guides.to_yaml)
  end

  # Builds one topic hash per ".topic-block" found on the start page.
  #
  # As a side effect, every guide reachable from a topic is fetched and
  # appended to @guides (via #link_groups -> #linked_items -> #add_guide).
  #
  # @return [Array<Hash>] topic hashes ready to be serialised
  def sections
    doc = fetch_document(@start_url)
    doc.css(".topic-block").map do |section_block|
      section_path = section_block.css(".heading-small a").attr('href').value
      base_path = service_manual_path(section_path)
      groups = link_groups(section_path)

      {
        content_id: SecureRandom.uuid,
        title: section_block.css(".heading-small").text.strip,
        base_path: base_path,
        description: section_block.css("p").text.strip,
        format: "service_manual_topic",
        publishing_app: "service-manual-publisher",
        rendering_app: "government-frontend",
        need_ids: [],
        locale: "en",
        update_type: 'minor',
        public_updated_at: Time.now.iso8601,
        details: {
          link_groups: groups
        },
        links: {
          # Flat list of every guide id in this topic, derived from the groups
          # themselves rather than from state mutated elsewhere.
          linked_items: groups.flat_map { |group| group[:linked_items] }
        },
        routes: [
          { type: "exact", path: base_path }
        ],
      }
    end
  end

  # Fetches a topic page and returns its collapsible subsections.
  #
  # The nth "h2", the nth ".topic-description" and the nth "ul" are treated as
  # belonging to the same subsection; a missing heading or description at an
  # index yields an empty string instead of raising.
  #
  # @param section_path [String] path of the topic page, relative to the start URL
  # @return [Array<Hash>] hashes with :title, :description and :linked_items
  def link_groups(section_path)
    doc = fetch_document(File.join(@start_url, section_path))
    headings = doc.css(".collapsible-subsections h2")
    descriptions = doc.css(".topic-description")

    doc.css(".collapsible-subsections ul").map.with_index do |block, idx|
      description = descriptions[idx]&.text.to_s.strip
      heading = headings[idx]&.text.to_s.strip
      {
        # Both title and description live under the h2, so the description is
        # cut out of the heading text; the final strip drops the whitespace
        # that used to separate the two.
        title: heading.gsub(description, '').strip,
        description: description,
        linked_items: linked_items(block)
      }
    end
  end

  # Crawls every link inside +block+ and returns the ids of the guides found.
  #
  # @param block [#css] element containing the guide links
  # @return [Array<String>] content ids, skipping pages that failed to load
  def linked_items(block)
    block.css("a").map { |link| add_guide(link) }.compact.map { |guide| guide[:content_id] }
  end

  # Prefixes +path+ with "/service-manual" and drops one trailing slash.
  #
  # @param path [String]
  # @return [String]
  def service_manual_path(path)
    File.join("/service-manual", path.chomp("/"))
  end

  # Fetches the page behind +link+, records it in @guides and returns it.
  #
  # The publisher details are hard-coded to the Agile Community page.
  #
  # @param link [#attr, #text] anchor element pointing at a guide
  # @return [Hash, nil] the guide hash, or nil when the page answers with an
  #   HTTP error (the error message is printed and the guide is skipped)
  def add_guide(link)
    href = link.attr('href')
    page_url = File.join(@start_url, href)
    puts page_url
    doc = fetch_document(page_url)

    base_path = service_manual_path(href)
    timestamp = Time.now.iso8601
    guide = {
      content_id: SecureRandom.uuid,
      title: link.text,
      description: "-",
      format: "service_manual_guide",
      publishing_app: "service-manual-publisher",
      rendering_app: "government-frontend",
      need_ids: [],
      locale: "en",
      updated_at: timestamp,
      public_updated_at: timestamp,
      update_type: "minor",
      phase: "beta",
      base_path: base_path,
      routes: [
        { type: "exact", path: base_path }
      ],
      details: {
        body: doc.css(".markdown").to_s,
        header_links: [],
        publisher: {
          name: "Agile Community",
          href: "http://sm-11.herokuapp.com/agile-delivery/agile-and-government-services"
        }
      }
    }
    @guides << guide
    guide
  rescue OpenURI::HTTPError => e
    puts e.message
    nil
  end

  private

  # Downloads +url+ and parses it as HTML.
  #
  # Uses URI.open because opening URLs through Kernel#open is no longer
  # supported on Ruby 3.0+; the block form closes the response IO once parsed.
  #
  # @param url [String]
  # @return [Nokogiri::HTML::Document]
  # @raise [OpenURI::HTTPError] when the server answers with an error status
  def fetch_document(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end
end
# Script entry point: crawl the default start URL and run the full export.
crawler = Crawl.new
crawler.go
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.