@tgittos · Last active August 29, 2015
deviantArt RSS scraper
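A small Python 2 script that walks a deviantART RSS feed, saves each entry's image into a folder of your choice, and records when each feed was last checked in a pickled feeds.db next to the working directory, so repeated runs only download new items. It requires the feedparser package.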
#! /usr/bin/env python
# deviantArt RSS scraper (Python 2). Walks a deviantART RSS feed, downloads
# each entry's image, and remembers when each feed was last checked in a
# pickled feeds.db so that later runs only fetch new items.
import sys
import feedparser
import urllib
import os
import pickle
import time
import datetime


class Scraper:
    """Abstract base class for feed scrapers."""

    def update(self):
        raise NotImplementedError("Abstract method called")

    def fetch_all(self, feed_url, destination_folder):
        raise NotImplementedError("Abstract method called")

    def fetch_new(self, date):
        raise NotImplementedError("Abstract method called")


class DeviantArtScraper(Scraper):
    # Pickled history of every feed fetched so far:
    # {feed_url: {'path': output_dir, 'updated': time.struct_time}}
    FEEDS_HISTORY = os.path.join(os.path.abspath('.'), "feeds.db")

    def __init__(self):
        self.__feed_config = self.__read_config()

    def update(self):
        # Re-check every feed we have fetched before.
        for url in self.__feed_config:
            self.__feed_url = url  # __download uses this to find the output path
            print "Checking feed " + url
            print "Feed last checked on " + self.__format_time(self.__feed_config[url]['updated'])
            self.__download(url)
            self.__update_config(url)
            print ""
        self.__write_config(self.__feed_config)

    def fetch_all(self, feed_url, destination_folder):
        # First fetch of a feed: remember where its images should be saved.
        self.__feed_url = feed_url
        self.__output = destination_folder
        if feed_url not in self.__feed_config:
            self.__feed_config[feed_url] = {'path': os.path.abspath(self.__output)}
        print "Checking feed " + feed_url
        if "updated" in self.__feed_config[feed_url]:
            print "Feed last checked on " + self.__format_time(self.__feed_config[feed_url]['updated'])
        self.__download(self.__feed_url)
        self.__update_config(self.__feed_url)
        self.__write_config(self.__feed_config)
        print "done!"

    def __download(self, feed_url):
        # Fetch one page of the feed, save every image newer than the last
        # check, then recurse into the rel="next" page if there is one.
        rss = feedparser.parse(feed_url)
        print "Latest item in feed updated " + self.__format_time(rss['entries'][0]['published_parsed'])
        for entry in rss['entries']:
            # Entries are newest-first, so stop at the first one already seen.
            if (feed_url in self.__feed_config
                    and 'updated' in self.__feed_config[feed_url]
                    and entry['published_parsed'] <= self.__feed_config[feed_url]['updated']):
                print "Done processing new entries"
                return
            if 'media_content' not in entry:
                print "No image URL for entry " + entry['title'] + ", skipping"
                continue
            url = entry['media_content'][0]['url']
            # Name the local file after the last path segment of the image URL.
            filename = os.path.join(self.__feed_config[self.__feed_url]['path'],
                                    url.split('/')[-1])
            if not os.path.isfile(filename):
                print "Saving " + url + " to " + filename
                try:
                    urllib.urlretrieve(url, filename)
                except IOError:
                    print "Error downloading " + url
            else:
                print "File " + filename + " already exists, skipping"
        # deviantART paginates its feeds via rel="next" links.
        next_links = [link['href'] for link in rss['feed']['links'] if link['rel'] == "next"]
        if next_links:
            self.__download(next_links[0])

    def __update_config(self, feed):
        # Record the time of this check so the next run can stop early.
        self.__feed_config[feed]['updated'] = time.gmtime()

    def __format_time(self, struct):
        # Render a time.struct_time in the locale's date/time format.
        return datetime.datetime.fromtimestamp(time.mktime(struct)).strftime("%c")

    def __read_config(self):
        config = {}
        if os.path.isfile(self.FEEDS_HISTORY):
            with open(self.FEEDS_HISTORY, 'rb') as f:  # pickle needs binary mode
                config = pickle.load(f)
        return config

    def __write_config(self, config):
        with open(self.FEEDS_HISTORY, 'wb') as f:
            pickle.dump(config, f)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "usage: %s update | %s <feed_url> <output_dir>" % (sys.argv[0], sys.argv[0])
        sys.exit(1)
    da = DeviantArtScraper()
    if sys.argv[1] == "update":
        da.update()
    else:
        da.fetch_all(sys.argv[1], sys.argv[2])
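Example usage, assuming the script is saved as scraper.py and feedparser is installed. The gallery feed URL is illustrative; at the time, deviantART's RSS endpoints followed the backend.deviantart.com/rss.xml pattern, but check the RSS link on the gallery page you want to mirror:

    # first fetch: mirror a feed into a folder and record it in feeds.db
    ./scraper.py "http://backend.deviantart.com/rss.xml?q=gallery%3Asomeartist" ./someartist

    # later runs: re-check every recorded feed for new entries only
    ./scraper.py update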