Skip to content

Instantly share code, notes, and snippets.

@lulalala
Created January 21, 2016 16:08
Show Gist options
  • Save lulalala/d8c4442bec10b8d3cc79 to your computer and use it in GitHub Desktop.
Save lulalala/d8c4442bec10b8d3cc79 to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
# Seed one Fountain record per issue, numbered 1 through 91.
# The issue number is also stored as the title.
1.upto(91) do |issue|
  Fountain.create(issue: issue, title: issue)
end
# Scrape the articles of issues 1..91 from www.catholic.org.tw and store each
# one as a Post linked to the issue's Fountain through a FountainEntry.
#
# Article pages are named ind<issue><article>.htm, both numbers zero-padded to
# two digits. Articles are requested in order starting at 01; the first HTTP
# error ends the current issue and the loop moves on to the next one.
(1..91).each do |issue|
  fountain = Fountain.find_by(issue: issue)
  unless fountain
    # Without a Fountain every FountainEntry insert below would fail, so skip
    # the issue instead of downloading pages that cannot be stored.
    warn "No Fountain found for issue #{issue}; skipping"
    next
  end

  issue_code = format('%02d', issue)
  article = 0
  loop do
    article += 1
    article_code = format('%02d', article)
    url = "http://www.catholic.org.tw/ccsc/book/ind#{issue_code}#{article_code}.htm"
    puts url

    begin
      # URI#open comes from open-uri and, unlike Kernel#open with a URL
      # (removed in Ruby 3.0), works on old and new Rubies alike. The block
      # form closes the downloaded stream as soon as it has been parsed.
      doc = URI.parse(url).open { |page| Nokogiri::HTML(page) }
    rescue OpenURI::HTTPError
      break
    end

    post = Post.new
    post.body = doc.css('body>table table tr:nth-child(4) td').inner_html
    post.title = doc.css('body>table table tr:nth-child(3) td marquee').text.delete('■').strip
    post.author = doc.css('body>table table tr:nth-child(4) td div:first-child').text.delete('■').strip

    begin
      # Post and link are written together so a failed link leaves no orphan Post.
      Post.transaction do
        post.save!
        FountainEntry.create!(fountain_id: fountain.id, post_id: post.id)
      end
    rescue StandardError => e
      # Best effort: one bad article must not abort the whole import, but the
      # failure is reported instead of being silently discarded.
      warn "Failed to store #{url}: #{e.class}: #{e.message}"
    end
  end
end
# Remove every <div align="right"> that is the first child of its parent from
# each stored post body, then save the re-serialized, stripped HTML.
# NOTE(review): presumably this div is the author line that the scraper already
# copied into Post#author - confirm against the source pages.
#
# find_each walks the table in batches instead of loading every Post at once.
Post.find_each do |post|
  fragment = Nokogiri::HTML.fragment(post.body)
  fragment.search('div:first-child[align=right]').each(&:remove)
  post.body = fragment.to_s.strip
  # Keep going on failure, but do not let a rejected save pass unnoticed.
  warn "Could not save post #{post.id}" unless post.save
end
# Tells whether +node+ carries no visible content: either a text node whose
# stripped content is blank, or a <br> element.
#
# node - an object responding to text?, element?, name and content.
#
# Returns a truthy value for such nodes and a falsy value otherwise.
def is_blank?(node)
  return true if node.text? && node.content.strip.blank?

  node.element? && node.name == 'br'
end
# Tells whether every direct child of +node+ passes is_blank?.
# A node without children counts as all-blank.
#
# Written as a free-standing helper; the original author noted that
# monkeypatching would (sometimes) be the more convenient way to do this.
def all_children_are_blank?(node)
  node.children.all?(&method(:is_blank?))
end
# Remove every <p> that is the first child of its parent and contains nothing
# but blank children (as judged by all_children_are_blank?) from each stored
# post body, then save the re-serialized, stripped HTML.
#
# find_each walks the table in batches instead of loading every Post at once.
Post.find_each do |post|
  fragment = Nokogiri::HTML.fragment(post.body)
  # Distinct block names keep the paragraph from shadowing the post.
  empty_paragraphs = fragment.css('p:first-child').select do |paragraph|
    all_children_are_blank?(paragraph)
  end
  empty_paragraphs.each(&:remove)
  post.body = fragment.to_s.strip
  # Keep going on failure, but do not let a rejected save pass unnoticed.
  warn "Could not save post #{post.id}" unless post.save
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment