Created
September 26, 2012 20:52
-
-
Save siers/3790516 to your computer and use it in GitHub Desktop.
Savelk kompaktus pasaku HTMLus.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'nokogiri' | |
require 'pry' | |
# Wrap a tale's HTML fragment in a minimal standalone HTML document.
#
# The document declares a windows-1257 charset in its <meta> tag, and the
# display heading is used for both <title> and <h1>.
#
# @param title [String] URL slug of the tale (e.g. "tris-brali"); hyphens are
#   turned into spaces and the result is capitalized for display.
# @param content [String] HTML fragment placed in <body> below the heading.
# @return [String] the complete HTML document.
def wrap(title, content)
  # Single-character replacement, so `tr` is enough (no regex machinery).
  heading = title.tr('-', ' ').capitalize

  # The heredoc body is kept at column 0 because `<<-` preserves leading
  # whitespace of every body line in the generated document.
  <<-TEMPLATE
<!DOCTYPE html>
<html>
<head>
<title>#{ heading }</title>
<meta charset="windows-1257" />
</head>
<body>
<h1>#{ heading }</h1>
#{ content }
</body>
</html>
  TEMPLATE
end
# For every URL listed in the file `liste`: fetch the page (cached under
# raw/), extract the children of the element with id "pageText", and write
# them as a standalone windows-1257 HTML file under output/.

# Same effect as `mkdir -p output raw` for these single-level directories,
# without spawning a shell.
%w[output raw].each { |dir| Dir.mkdir(dir) unless File.directory?(dir) }

# `strip`, not `strip!`: the bang form returns nil when there is nothing to
# strip (e.g. a final line without a trailing newline), which would put nil
# entries into the list. Blank lines are dropped as well.
lines = File.readlines('liste').map(&:strip).reject(&:empty?)

lines.each do |url|
  # The tale's slug is the last path segment of the URL (lowercase letters and
  # hyphens followed by a trailing slash). Skip lines that do not have one
  # instead of crashing on a nil match.
  title = url[/([a-z\-]+)\/$/, 1]
  unless title
    warn "\nSkipping unrecognised line: #{ url }"
    next
  end

  fname = "raw/#{ title }.html"

  # Reuse a previously downloaded page when one exists. An empty cache file
  # (left behind by an earlier failed download) is treated as missing.
  content = File.file?(fname) ? File.read(fname) : nil
  if content.nil? || content.empty?
    # Argument-array form: the URL is handed to curl directly and never
    # parsed by a shell. curl's stderr is discarded, as before.
    content = IO.popen(['curl', url], err: File::NULL, &:read)
    if content.nil? || content.empty?
      warn "\nDownload failed: #{ url }"
      next
    end
    File.open(fname, 'wb') { |f| f.write(content) }
  end

  # Best effort: a page that cannot be parsed or converted to windows-1257 is
  # skipped, but the reason is reported instead of being silently dropped.
  begin
    pasaka = Nokogiri::HTML(content).css('#pageText').children.to_s.encode('windows-1257')
  rescue StandardError => e
    warn "\nSkipping #{ title }: #{ e.class }: #{ e.message }"
    next
  end

  # Block form closes (and therefore flushes) the file deterministically.
  File.open("output/#{ title }.html", 'wb') { |f| f.write(wrap(title, pasaka)) }
  putc '.'
end

puts "\nDarīts."
__END__ | |
$ for i in {1..4}; do curl "http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/?page=$i" >> x; done
$ ack -o '(?<=href=.).*citu_tautu_pasakas/lietuviesu-pasakas/[^\?"#][^"]{4,}' x | ack -v lang | sort -u > liste
$ ./export
$ htmldoc -f Pasakas.pdf --charset cp-1257 --header \ --continuous output/*
listi taisiet paši, šādā formātā: | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/barenitis-un-raganas/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/gailitis-un-vistina/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/ikstitis/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/jonuks-un-elenite/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/kakens-un-gailitis/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/kakinauska-kungs/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/kapec-suni-ir-gudri/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/karalmeita-un-azitis/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/kazina-mele/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/krietnais-puisis/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/lupstainitis/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/maizite/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/par-dzelzs-vilku/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/par-kadu-viru-sauli-menesi-un-veju/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/pirts-dievam/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/sala-davanas/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/tris-brali/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/tris-brali-i/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/tuksa-muiza/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/vislielakais-mulkis/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/zalksa-sieva/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/zelta-abelite/ | |
http://www.pasakas.net/pasakas/citu_tautu_pasakas/lietuviesu-pasakas/zveru-greksudze/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment