Skip to content

Instantly share code, notes, and snippets.

@hcolomb
Created October 19, 2014 23:25
Show Gist options
  • Save hcolomb/f70d1233c8a2b849d0a1 to your computer and use it in GitHub Desktop.
Save hcolomb/f70d1233c8a2b849d0a1 to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
require 'rss'
# Reads feed URLs from "feeds.txt" (one per line, resolved against the
# current working directory) and scrapes each of them in turn.
#
# Surrounding whitespace, including the trailing newline, is stripped from
# every line and blank lines are skipped, so scrape_site always receives a
# clean URL. A failure while scraping one feed is reported on stdout and does
# not stop the remaining feeds.
#
# @return [Array<String>] the cleaned list of URLs that were processed
# @raise [SystemCallError] if "feeds.txt" cannot be read
def read_list
  # File.readlines opens and closes the file itself, even when reading fails.
  urls = File.readlines("feeds.txt").map(&:strip).reject(&:empty?)
  urls.each do |url|
    begin
      scrape_site(url)
    rescue => e
      puts "Scrape of #{url} failed."
      # Say why, so a broken feed can be diagnosed from the log.
      puts "  #{e.class}: #{e.message}"
    end
  end
end
# Fetches one feed and scrapes every recent post it contains.
#
# Progress is printed as "Scraping i of n." with i counting from 1.
#
# @param site [String] URL of the feed, handed to parse_feed
# @return [Array] the posts returned by find_posts
def scrape_site(site)
  feed = parse_feed(site)
  posts = find_posts(feed)
  total = posts.length
  # each_with_index gives the position directly: no O(n) lookup per post,
  # and the number stays correct when two posts compare equal. The +1 makes
  # the counter 1-based so the last line reads "n of n".
  posts.each_with_index do |post, position|
    puts "Scraping #{position + 1} of #{total}."
    scrape_post(post)
  end
end
# Downloads and parses the feed at +url+.
#
# The URL is opened through the URI object's own #open (added by open-uri)
# instead of Kernel#open. Kernel#open would run a value such as "|command"
# as a subprocess or open a local path, and it no longer handles URLs on
# Ruby 3.0+. The block form closes the downloaded body once it is parsed.
#
# @param url [String] URL of the feed; surrounding whitespace is ignored
# @return [Object] whatever RSS::Parser.parse returns for the feed
# @raise [ArgumentError] if +url+ is not a URL that open-uri can fetch
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
def parse_feed(url)
  puts "Parsing #{url}"
  uri = URI.parse(url.to_s.strip)
  # Only URI classes extended by open-uri expose a public #open.
  unless uri.respond_to?(:open)
    raise ArgumentError, "Unsupported feed URL: #{url.inspect}"
  end

  uri.open(:redirect => true) { |io| RSS::Parser.parse(io) }
end
# Selects the feed items published within the last +max_age+ seconds.
#
# Items without a date are skipped; Time.at(nil) would otherwise raise and
# abort the whole feed. feed.items itself is left untouched and a new array
# is returned.
#
# @param feed [#items] parsed feed whose items respond to #date
# @param max_age [Numeric] size of the look-back window in seconds
# @return [Array] items whose date is strictly newer than (now - max_age)
def find_posts(feed, max_age = 600)
  # Read the clock once so every item is compared against the same cutoff.
  cutoff = Time.now - max_age
  posts = feed.items.select do |post|
    post.date && Time.at(post.date) > cutoff
  end
  puts "Found #{posts.length} posts."
  posts
end
# Scrapes a single post: downloads its page, collects the image links via
# find_images/find_links and, when there is at least one, passes them to
# write_file together with the page title and the post's link.
#
# @param post [#link] feed item to scrape
# @return [Object, nil] nil when no image links were found, otherwise
#   whatever write_file returns
def scrape_post(post)
  puts "Scraping #{post.link}"
  page = open_site(post.link)
  image_nodes = find_images(page)
  page_title = find_title(page)
  image_links = find_links(image_nodes)
  # Nothing to save for pages without matching images.
  return if image_links.empty?

  write_file(image_links, page_title, post.link)
end
# Downloads the page at +site+ and parses it with Nokogiri.
#
# The URL is opened through the URI object's own #open (added by open-uri)
# instead of Kernel#open. Kernel#open would run a value such as "|command"
# as a subprocess or open a local path, and it no longer handles URLs on
# Ruby 3.0+. The block form closes the downloaded body once it is parsed.
#
# @param site [String] URL of the page; surrounding whitespace is ignored
# @return [Object] the document returned by Nokogiri::HTML
# @raise [ArgumentError] if +site+ is not a URL that open-uri can fetch
# @raise [URI::InvalidURIError] if +site+ cannot be parsed as a URI
def open_site(site)
  uri = URI.parse(site.to_s.strip)
  # Only URI classes extended by open-uri expose a public #open.
  unless uri.respond_to?(:open)
    raise ArgumentError, "Unsupported page URL: #{site.inspect}"
  end

  uri.open { |io| Nokogiri::HTML(io) }
end
# Looks up the post's image elements in a parsed page.
#
# @param doc [#css] parsed page; only #css is called on it
# @return [Object] the result of doc.css for <img> elements that are direct
#   children of a <span> directly inside a span whose class attribute is
#   exactly "imageblock"
def find_images(doc)
  selector = 'span[class = "imageblock"] > span > img'
  doc.css(selector)
end
# Extracts the page title from a parsed page.
#
# @param doc [#css] parsed page; only #css is called on it
# @return [Object] the #text of whatever doc.css('title') returns
def find_title(doc)
  title_nodes = doc.css('title')
  title_nodes.text
end
# Collects the "src" value of every image.
#
# Images without a "src" are dropped; a nil entry would otherwise reach
# write_file, which calls gsub on every link.
#
# @param images [Enumerable<#[]>] image elements, each indexable by 'src'
# @return [Array] the non-nil 'src' values, in document order
def find_links(images)
  images.map { |image| image['src'] }.compact
end
# Writes one Markdown post with YAML front matter to
# "<posts_dir>/<timestamp>.md": a "layout: post" / "title:" header, one
# linked image per entry of +list+, and a closing "Source:" line.
#
# The first link decides the layout for the whole list. If it contains
# "original", each link is used as the click target and its "image" variant
# as the inline picture. Otherwise each link is the inline picture and its
# "original" variant is the click target.
#
# @param list [Array<String>] image URLs
# @param title [String] post title; double quotes and backslashes are escaped
# @param link [String] URL of the scraped post, written as the source
# @param posts_dir [String] directory the file is created in
# @return [nil]
# @raise [SystemCallError] if the file cannot be created
def write_file(list, title, link, posts_dir = "/home/hcolomb/tistory-scraper/site/_posts")
  # Read the clock once so every part of the file name describes the same instant.
  now = Time.now
  time = "#{now.year}-#{now.month}-#{now.day}-#{now.hour}#{now.min}_#{now.sec}"
  # Escape \ and " so the title stays a valid YAML double-quoted scalar.
  safe_title = title.to_s.gsub(/["\\]/) { |char| "\\#{char}" }
  links_are_originals = list[0] =~ /original/

  File.open(File.join(posts_dir, "#{time}.md"), "w+") do |f|
    f.puts("---")
    f.puts("layout: post")
    f.puts("title: \"#{safe_title}\"")
    f.puts("---\n")
    list.each do |image_link|
      if links_are_originals
        inline = image_link.gsub(/original/, "image")
        f.puts("[![#{inline}](#{inline})](#{image_link})")
      else
        f.puts("[![#{image_link}](#{image_link})](#{image_link.gsub(/image/, "original")})")
      end
    end
    f.puts("Source: #{link}")
    puts "Saving as #{time}.md"
  end
end
# Main polling loop: print the current time shifted to UTC-04:00, scrape every
# feed via read_list, then pause 601 seconds before the next pass.
# NOTE(review): find_posts only looks back 600 seconds while a full cycle lasts
# 601 seconds plus the scraping time, so posts dated in that gap appear to be
# skipped. Confirm whether that is intended.
loop do
  started_at = Time.now
  puts started_at.localtime("-04:00")
  read_list
  puts "Sleeping........................\n\n"
  sleep(601)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment