Skip to content

Instantly share code, notes, and snippets.

@WardCunningham
Last active August 29, 2015 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save WardCunningham/9792cb44076a5ca60c29 to your computer and use it in GitHub Desktop.
Federation Scraper
logs
sites
.DS_Store
*.numbers
# Federation scraper (variant 1): crawl a federated-wiki neighborhood
# starting at `root`, saving each site's exported pages to disk.
require 'net/http'
require 'json'
# Starting site for the crawl.
root = 'h2.ward.asia.wiki.org'
# Maps site name => list of page slugs once that site has been fetched.
@fed = {}
# Work queue of site names still to fetch, seeded with the root.
@nxt = [root]
# Add a site to the work queue unless it has already been fetched (@fed)
# or is already waiting in the queue (@nxt).
def queue(site)
  return if @fed[site] || @nxt.include?(site)
  @nxt << site
  puts "queueing #{site} of #{@nxt.length}"
end
# Yield the 'site' field of every story item and every journal action on a
# page — these references are the outbound links that drive the crawl.
#
# site/slug are used only in the error message; page is the parsed JSON page.
# Fix: guard 'story'/'journal' with `|| []` (as script 3's tally already does)
# so a page missing its story no longer aborts before the journal is scanned.
# The rescue remains for genuinely malformed page data.
def contexts site, slug, page
  begin
    (page['story'] || []).each do |item|
      yield item['site'] if item['site']
    end
    (page['journal'] || []).each do |action|
      yield action['site'] if action['site']
    end
  rescue
    # Malformed page data (e.g. story is not an array) — log and move on.
    puts "context #{site}/#{slug} rescued"
  end
end
# Walk every page of a site, report each distinct referenced site once
# per page, and queue every reference so the crawl can follow it.
def tally(site, pages)
  pages.each do |slug, page|
    seen = {}
    contexts(site, slug, page) do |ref|
      puts "\t#{site}: #{page['title']} => #{ref}" unless seen.key?(ref)
      seen[ref] = true
      queue ref
    end
  end
end
# Fallback fetch: pull a site's sitemap, then fetch every listed page one
# request at a time (rate-limited). On success records the site's slugs in
# @fed and yields (site, pages) to the caller's block.
def sitemap site
begin
res = Net::HTTP.get_response site, '/system/sitemap.json'
if res.code == '200'
# Record slugs immediately so the site counts as fetched while we work.
@fed[site] = slugs = JSON.parse(res.body).map {|each| each['slug']}
pages = {}
bytes = 0
slugs.each do |slug|
res = Net::HTTP.get_response site, "/#{slug}.json"
pages[slug] = JSON.parse(res.body)
# puts "fetch #{site} slug #{slug} got #{res.body.length} bytes"
bytes += res.body.length
# Be polite to the remote server between page requests.
sleep 0.2
end
puts "fetch #{site} sitemap got #{bytes} bytes"
yield site, pages
else
puts "fetch #{site} sitemap got code #{res.code}"
end
rescue
# Any network or parse failure — log and skip this site.
puts "fetch #{site} sitemap rescued"
end
end
# Preferred fetch: grab the whole site in one request via /system/export.json.
# On success records the site's slugs in @fed and yields (site, pages).
# NOTE(review): the sitemap fallback is commented out in this variant, so a
# site without export.json is simply skipped.
def fetch site
begin
res = Net::HTTP.get_response site, '/system/export.json'
if res.code == '200'
puts "fetch #{site} export got #{res.body.length} bytes"
pages = JSON.parse res.body
@fed[site] = pages.keys
yield site, pages
else
puts "fetch #{site} export got code #{res.code}"
# sitemap site
end
rescue
# Network or JSON failure — log and skip this site.
puts "fetch #{site} export rescued"
end
end
# Crawl up to `max` sites breadth-first, writing each site's export to
# sites/<site> as pretty-printed JSON and tallying its references
# (tally extends @nxt, keeping the crawl going).
max = 999
while @nxt.length > 0 and (max-=1) >= 0
fetch @nxt.shift do |site, pages|
File.open("sites/#{site}","w") do |file|
file.write JSON.pretty_generate pages
tally site, pages
end
end
end
puts "done"
# --- Scraper variant 2: same crawl, but tallies inline and falls back to
# --- the page-by-page sitemap when a site has no export.json.
require 'net/http'
require 'json'
# Starting site for the crawl.
root = 'h2.ward.asia.wiki.org'
# Maps site name => list of page slugs once that site has been fetched.
@fed = {}
# Work queue of site names still to fetch, seeded with the root.
@nxt = [root]
# Enqueue a site for crawling, skipping sites already fetched or waiting.
def queue(site)
  already_known = @fed[site] || @nxt.include?(site)
  unless already_known
    @nxt << site
    puts "queueing #{site} of #{@nxt.length}"
  end
end
# Yield the 'site' field of every story item and every journal action on a
# page — these are the outbound references that drive the crawl.
#
# site/slug are used only in the error message; page is the parsed JSON page.
# Fix: guard 'story'/'journal' with `|| []` (as script 3's tally already does)
# so a page missing its story no longer aborts before the journal is scanned.
# The rescue remains for genuinely malformed page data.
def contexts site, slug, page
  begin
    (page['story'] || []).each do |item|
      yield item['site'] if item['site']
    end
    (page['journal'] || []).each do |action|
      yield action['site'] if action['site']
    end
  rescue
    # Malformed page data (e.g. story is not an array) — log and move on.
    puts "context #{site}/#{slug} rescued"
  end
end
# For each page of a site: announce each newly-seen referenced site once,
# and queue every reference so the crawl can follow it.
def tally(site, pages)
  pages.each do |slug, page|
    announced = Hash.new(false)
    contexts(site, slug, page) do |ref|
      puts "\t#{site}: #{page['title']} => #{ref}" unless announced[ref]
      announced[ref] = true
      queue ref
    end
  end
end
# Fallback fetch: read the site's sitemap, then fetch each listed page
# individually (rate-limited), tallying the results inline.
def sitemap site
begin
res = Net::HTTP.get_response site, '/system/sitemap.json'
if res.code == '200'
# Record slugs immediately so the site counts as fetched while we work.
@fed[site] = slugs = JSON.parse(res.body).map {|each| each['slug']}
pages = {}
bytes = 0
slugs.each do |slug|
res = Net::HTTP.get_response site, "/#{slug}.json"
pages[slug] = JSON.parse(res.body)
# puts "fetch #{site} slug #{slug} got #{res.body.length} bytes"
bytes += res.body.length
# Be polite to the remote server between page requests.
sleep 0.2
end
puts "fetch #{site} sitemap got #{bytes} bytes"
tally site, pages
else
puts "fetch #{site} sitemap got code #{res.code}"
end
rescue
# Any network or parse failure — log and skip this site.
puts "fetch #{site} sitemap rescued"
end
end
# Preferred fetch: whole-site export in one request via /system/export.json.
# On success records the site's slugs in @fed and tallies its pages; on a
# non-200 response falls back to the page-by-page sitemap crawl.
def fetch site
begin
res = Net::HTTP.get_response site, '/system/export.json'
if res.code == '200'
puts "fetch #{site} export got #{res.body.length} bytes"
pages = JSON.parse res.body
@fed[site] = pages.keys
tally site, pages
else
puts "fetch #{site} export got code #{res.code}"
sitemap site
end
rescue
# Network or JSON failure — log and skip this site.
puts "fetch #{site} export rescued"
end
end
# Crawl breadth-first; max caps this run at 5 sites (a small test run,
# versus 999 in the other variants).
max = 5
while @nxt.length > 0 and (max-=1) >= 0
fetch @nxt.shift
end
puts "done"
# --- Variant 3: offline analysis pass. Reads the sites/<site> files cached
# --- by the scraper above and aggregates per-site statistics into a CSV.
require 'json'
require 'csv'
require 'pp'
# Starting site for the walk over the cache.
root = 'h2.ward.asia.wiki.org'
# Maps site name => list of page slugs once that site's cache file is read.
@fed = {}
# Work queue of site names still to process, seeded with the root.
@nxt = [root]
# Two-dimensional counter: rows are sites, columns are statistic names.
# save dumps the whole table to csv/<name>.csv and pretty-prints it.
class Tally
  def initialize
    # Auto-vivifying: each row is its own column => count hash defaulting to 0.
    @rows = Hash.new { |hash, key| hash[key] = Hash.new(0) }
    # Remembers every column ever counted; only its keys (column order) are
    # read back, in save.
    @columns = Hash.new(0)
  end

  # Add n to the (row, column) cell and note the column name.
  def count(row, column, n = 1)
    @rows[row][column] += n
    @columns[column] += 1
  end

  # Write the table as CSV: header row is the tally name plus every column
  # name, then one row per counted row. Also pretty-prints the raw rows.
  def save(name)
    keys = @columns.keys.to_a
    CSV.open("csv/#{name}.csv", 'wb') do |file|
      file << [name, *keys]
      @rows.each do |row, cells|
        file << [row, *keys.map { |key| cells[key] }]
      end
    end
    pp @rows
  end
end
# Global statistics table shared by the tally/fetch methods below.
@sites = Tally.new
# Enqueue a site unless it has already been read or is already waiting.
# (Logging is disabled in this analysis variant.)
def queue(site)
  return if @fed[site] || @nxt.include?(site)
  @nxt << site
  # puts "queueing #{site} of #{@nxt.length}"
end
# Yield the 'site' field of every story item and every journal action on a
# page — the outbound references this analysis counts and follows.
#
# site/slug are used only in the error message; page is the parsed JSON page.
# Fix: guard 'story'/'journal' with `|| []` (consistent with tally below) so
# a page missing its story no longer aborts before the journal is scanned.
# The rescue remains for genuinely malformed page data.
def contexts site, slug, page
  begin
    (page['story'] || []).each do |item|
      yield item['site'] if item['site']
    end
    (page['journal'] || []).each do |action|
      yield action['site'] if action['site']
    end
  rescue
    # Malformed page data (e.g. story is not an array) — log and move on.
    puts "context #{site}/#{slug} rescued"
  end
end
# Aggregate statistics for one site's pages into @sites:
# pages/actions/items counts per site, one citation per page credited to
# each distinct referenced site already in @fed, and finally the number of
# distinct neighbor sites referenced anywhere on this site.
def tally(site, pages)
  cites = Hash.new(0)
  pages.each do |slug, page|
    @sites.count site, 'pages'
    @sites.count site, 'actions', (page["journal"] || []).length
    @sites.count site, 'items', (page['story'] || []).length
    marks = {}
    contexts(site, slug, page) do |ref|
      cites[ref] += 1
      # puts "\t#{site}: #{page['title']} => #{ref}" unless marks[ref]
      marks[ref] = true
      queue ref
    end
    # One citation per page for each distinct referenced site we have read.
    marks.each_key do |cite|
      @sites.count cite, 'citations' if @fed[cite]
    end
  end
  @sites.count site, 'neighbors', cites.keys.size
end
# Load one site's cached export from sites/<site>, record its slugs in
# @fed, and feed its pages to tally. Sites not cached on disk are skipped.
def fetch(site)
  path = "sites/#{site}"
  unless File.exist?(path)
    puts "fetch #{site} not in cache"
    return
  end
  body = File.read(path)
  puts "fetch #{site} export got #{body.length} bytes"
  pages = JSON.parse(body)
  @fed[site] = pages.keys
  tally site, pages
rescue
  # Unreadable or unparsable cache file — log and continue the walk.
  puts "fetch #{site} fetch rescued"
end
puts "start"
# Walk the cached sites breadth-first (tally extends @nxt with references),
# bounded at 999 sites.
max = 999
while @nxt.length > 0 and (max-=1) >= 0
fetch @nxt.shift
end
# pp @fed
# Emit the aggregated statistics table as csv/sites.csv.
@sites.save 'sites'
puts "done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment