require 'net/http' | |
require 'uri' | |
require 'json' | |
require 'cgi' | |
# Returns the longest common substring (a contiguous run of characters)
# shared by +a+ and +b+, or an empty string when they share no character.
# When several common substrings tie for the longest, the one that ends
# earliest in +a+ is returned.
#
# Only two rows of the match table are kept at a time (the previous and the
# current one) instead of a full a.length x b.length matrix.
#
# @param a [String]
# @param b [String]
# @return [String] a substring of +a+ that also occurs in +b+
def lcs(a, b)
  a_chars = a.chars
  b_chars = b.chars
  previous_row = Array.new(b_chars.length, 0)
  best_length = 0
  best_end = 0

  a_chars.each_with_index do |x, i|
    # Cells for non-matching characters must read as 0, so start from zeros.
    current_row = Array.new(b_chars.length, 0)
    b_chars.each_with_index do |y, j|
      next unless x == y

      # Extend the run ending at (i - 1, j - 1); on the first row
      # previous_row is all zeros, so the run starts at 1.
      run = j.zero? ? 1 : previous_row[j - 1] + 1
      current_row[j] = run
      next unless run > best_length

      best_length = run
      best_end = i
    end
    previous_row = current_row
  end

  return '' if best_length.zero?

  a[best_end - best_length + 1, best_length]
end
# Makes one pass over +lines+, merging every fragment that overlaps +current+
# into it.
#
# For each fragment the longest substring shared with +current+ is found via
# +lcs+. If that overlap is empty, or shorter than one fifth of the
# fragment's length (integer division), the fragment is kept aside as unused.
# Otherwise the text of the fragment before the overlap is prepended to
# +current+ and the text after it is appended.
#
# Fragments equal to +current+ and empty fragments are dropped.
#
# @param current [String] the text being built up; it is not mutated
# @param lines [Array<String>] candidate fragments
# @return [Array<String>] the merged text followed by the unused fragments,
#   in their original order
def stitch(current, lines)
  unused = []
  lines.each do |line|
    next if line == current || line.empty?

    overlap = lcs(line, current)
    # An empty overlap means the fragment shares nothing with the text; for
    # fragments shorter than 5 characters the length threshold alone is 0 and
    # would let it through.
    if overlap.empty? || overlap.length < line.length / 5
      unused << line
      next
    end

    # partition splits around the first occurrence only and treats the
    # overlap literally, so no text of the fragment is lost even when the
    # overlap repeats or is a single space.
    head, _overlap, tail = line.partition(overlap)
    current = head + current + tail
  end
  [current] + unused
end
# Repeatedly runs +stitch+ over a list of fragments, always building on the
# first element, until a pass leaves the number of entries unchanged.
#
# A pass that returns as many entries as it was given has merged nothing, so
# further passes would see the same input and produce the same output; the
# loop therefore stops at the first such pass.
#
# @param stitched [Array<String>] fragments; the first one seeds the text
# @return [Array] the list returned by the last +stitch+ pass (merged text
#   first, then the fragments that could not be merged)
def page(stitched)
  loop do
    previous_count = stitched.length
    stitched = stitch(stitched[0], stitched)
    break if stitched.length == previous_count
  end
  stitched
end
# Script body: reads fragment lists from full.json, stitches each list into a
# single text with +page+, and writes the results to pages.txt ordered by
# numeric key.
#
# full.json is parsed as a JSON object; its keys must be accepted by
# Integer() (presumably page numbers) and each value is handed to +page+
# (presumably an array of fragment strings -- confirm against the data).
fragments = JSON.parse(File.read('full.json'))
@pages = {}
@threads = [] # NOTE(review): not referenced by any code visible in this file

fragments.each_pair do |page_key, page_fragments|
  # page returns the merged text first; leftover fragments are discarded.
  @pages[page_key] = page(page_fragments)[0]
  puts "#{@pages.length}/#{fragments.length}"
end

File.open('pages.txt', 'w') do |f|
  ordered = @pages.sort_by { |page_key, _text| Integer(page_key) }
  ordered.each do |page_key, text|
    f.puts "Page #{page_key}\n"
    f.puts text
    f.puts "\n"
  end
end
# (Web-page footer captured along with the script; not part of the program.)
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment