Skip to content

Instantly share code, notes, and snippets.

@tadast
Last active November 9, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tadast/defd1b6fc1e2eccd77f3 to your computer and use it in GitHub Desktop.
Save tadast/defd1b6fc1e2eccd77f3 to your computer and use it in GitHub Desktop.
Service manual prototype hierarchy crawler
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'securerandom'
require 'time'
require 'yaml'
# Crawls the service manual prototype site and dumps what it finds as YAML.
#
# The topic hierarchy read from the start page is written to
# section_hierarchy.yml and every guide page that could be fetched is written
# to guides.yml (both relative to the current working directory).
class Crawl
  DEFAULT_START_URL = "http://sm-11.herokuapp.com".freeze

  # @param start_url [String] root URL of the site; hrefs found on its pages
  #   are joined onto it
  def initialize(start_url = DEFAULT_START_URL)
    @start_url = start_url
    @guides = []
  end

  # Runs the crawl and writes both YAML files.
  #
  # @return [Integer] number of bytes written to guides.yml
  def go
    # Build the hierarchy first: crawling the sections is what fills @guides.
    hierarchy = sections
    File.write('section_hierarchy.yml', hierarchy.to_yaml)
    File.write('guides.yml', @guides.to_yaml)
  end

  # Builds one topic hash per ".topic-block" element on the start page.
  # Each topic page is fetched (via #link_groups) to collect its guides.
  #
  # @return [Array<Hash>] topic hashes in page order
  def sections
    doc = fetch_document(@start_url)
    doc.css(".topic-block").map do |section_block|
      section_path = section_block.css(".heading-small a").attr('href').value
      base_path = service_manual_path(section_path)
      groups = link_groups(section_path)
      {
        content_id: SecureRandom.uuid,
        title: section_block.css(".heading-small").text.strip,
        base_path: base_path,
        description: section_block.css("p").text.strip,
        format: "service_manual_topic",
        publishing_app: "service-manual-publisher",
        rendering_app: "government-frontend",
        need_ids: [],
        locale: "en",
        update_type: 'minor',
        public_updated_at: Time.now.iso8601,
        details: {
          link_groups: groups
        },
        links: {
          # Flat list of every guide id that appears in any group of this topic.
          linked_items: groups.flat_map { |group| group[:linked_items] }
        },
        routes: [
          { type: "exact", path: base_path }
        ],
      }
    end
  end

  # Fetches a topic page and returns one group per
  # ".collapsible-subsections ul" list. Headings and descriptions are matched
  # to lists by position; a missing heading or description becomes "".
  #
  # @param section_path [String] href of the topic page, joined onto the start URL
  # @return [Array<Hash>] hashes with :title, :description and :linked_items
  def link_groups(section_path)
    doc = fetch_document(File.join(@start_url, section_path))
    headings = doc.css(".collapsible-subsections h2")
    descriptions = doc.css(".topic-description")
    doc.css(".collapsible-subsections ul").each_with_index.map do |block, idx|
      description = node_text(descriptions[idx])
      {
        title: group_title(node_text(headings[idx]), description),
        description: description,
        linked_items: linked_items(block)
      }
    end
  end

  # Crawls every link inside +block+ and returns the content ids of the guides
  # that could be fetched. Links whose page failed to load are left out.
  #
  # @param block [#css] node containing the guide links
  # @return [Array<String>] guide content ids in link order
  def linked_items(block)
    block.css("a")
         .map { |link| add_guide(link) }
         .compact
         .map { |guide| guide[:content_id] }
  end

  # Prefixes +path+ with "/service-manual" and drops one trailing slash.
  #
  # @param path [String] path as it appears in an href
  # @return [String]
  def service_manual_path(path)
    File.join("/service-manual", path.chomp("/"))
  end

  # Fetches the page behind +link+, records it in @guides and returns it.
  #
  # @param link [#attr, #text] anchor node pointing at a guide page
  # @return [Hash, nil] the guide hash, or nil when the page answered with an
  #   HTTP error (the error message is printed and the crawl carries on)
  def add_guide(link)
    href = link.attr('href')
    page_url = File.join(@start_url, href)
    puts page_url
    doc = fetch_document(page_url)
    base_path = service_manual_path(href)
    timestamp = Time.now.iso8601
    guide = {
      content_id: SecureRandom.uuid,
      title: link.text,
      description: "-",
      format: "service_manual_guide",
      publishing_app: "service-manual-publisher",
      rendering_app: "government-frontend",
      need_ids: [],
      locale: "en",
      updated_at: timestamp,
      public_updated_at: timestamp,
      update_type: "minor",
      phase: "beta",
      base_path: base_path,
      routes: [
        { type: "exact", path: base_path }
      ],
      details: {
        body: doc.css(".markdown").to_s,
        header_links: [],
        publisher: {
          name: "Agile Community",
          href: "http://sm-11.herokuapp.com/agile-delivery/agile-and-government-services"
        }
      }
    }
    @guides << guide
    guide
  rescue OpenURI::HTTPError => e
    # Best effort: a broken link should not abort the whole crawl.
    puts e.message
    nil
  end

  private

  # Downloads +url+ and parses it as HTML. Uses URI.open because open-uri no
  # longer patches Kernel#open for URLs on Ruby 3.0+; the block form closes the
  # response stream once parsing is done.
  def fetch_document(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end

  # Stripped text of +node+, or "" when the node is missing.
  def node_text(node)
    node ? node.text.strip : ""
  end

  # Both title and description live under the h2, so the description is cut
  # out of the heading text. Stripping happens last so the whitespace that
  # separated the two does not end up in the title.
  def group_title(heading_text, description)
    title = description.empty? ? heading_text : heading_text.gsub(description, '')
    title.strip
  end
end
# Run the crawl only when this file is executed directly, so that merely
# loading it (e.g. from a console) does not start fetching pages and writing
# the YAML files.
Crawl.new.go if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment