## Run this file with Python 2 (any 2.x should do); it writes out the wrapper, server, and livescraper scripts below.
wrapper="""\
#!/bin/bash
# change this to match your system and preferences
export TUMBLR_BASE_DIR=~/tumblr
cd ${TUMBLR_BASE_DIR}
# git is a cheap way to keep backups, so let's use it;
# remove these two lines if you don't want them
git add scraped
git commit -m "scraped backup"
# run the webserver
python chrome-bookmarks.py &
# run the live scraper
python livescraper.py &
"""
server="""
import os
import random

from bottle import run, route, redirect

urllist = []
count = 0
verbose = True

# file of scraped tumblr urls, one per line, kept under TUMBLR_BASE_DIR
urlfile = os.path.join(os.getenv("TUMBLR_BASE_DIR", "."), "scraped")

@route("/")
def index() :
    return random_url()

@route("/random")
def random_url() :
    global count
    count += 1
    if count % 5 == 0 : # reload the url list every so often
        get_bookmarks()
    url = random.choice(urllist)
    if verbose :
        print url
    redirect(url)

def get_bookmarks() :
    global urllist
    with open(urlfile) as bf :
        urllist = [line.strip() for line in bf.readlines()]

if __name__ == "__main__" :
    get_bookmarks()
    run(host='0.0.0.0', port=8080, debug=True)
"""
livescraper="""
#!/usr/bin/python
import BeautifulSoup
import re
import shutil
import hashlib
import os
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

# matches the base url of a tumblr blog, e.g. http://example.tumblr.com/
tumblrpat = re.compile(r"(?P<base>http://[^/]*\.tumblr\.com/)")
verbose = True
oldies = None
bookmark_dir = os.getenv("TUMBLR_BASE_DIR", ".")
os.chdir(bookmark_dir)
class MyEventHandler(FileSystemEventHandler):
    def __init__(self, observer, filename):
        self.observer = observer
        self.filename = filename

    def on_created(self, event):
        print "e=", event
        if not event.is_directory :
            if event.src_path.endswith(".html") :
                print("got a new file {}".format(event.src_path))
                scrape(event.src_path)
def main() :
    global oldies
    # rotate backups of the scraped file; remove this block if you
    # don't care about backing it up (copies are skipped until they exist)
    for src, dst in [("scraped.backup.2", "scraped.backup.3"),
                     ("scraped.backup.1", "scraped.backup.2"),
                     ("scraped.backup", "scraped.backup.1"),
                     ("scraped", "scraped.backup")] :
        if os.path.exists(src) :
            shutil.copyfile(src, dst)
    oldies = set()
    if os.path.exists("scraped") :
        oldies = set([i.strip() for i in open("scraped").readlines()])
    print len(oldies)
    observer = Observer()
    event_handler = MyEventHandler(observer, bookmark_dir)
    observer.schedule(event_handler, bookmark_dir, recursive=False)
    observer.start()
    observer.join()
def scrape(newpage):
    if verbose :
        print "about to scrape", newpage
    oldlen = len(oldies)
    if not os.path.exists(newpage) :
        return # the create event seems to fire more than once
    page = open(newpage)
    doc = BeautifulSoup.BeautifulSoup(page)
    for link in doc.findAll('a') :
        url = link.get('href')
        if verbose :
            print url
        if url and 'avatar' not in url and 'tumblr' in url :
            if url.endswith('/') : # a blog or post url, keep it as is
                oldies.add(url)
            else :
                # probably an image; see if a base tumblr url can be pulled from it
                mo = tumblrpat.match(url)
                if mo :
                    if verbose :
                        print "adding base url from image", url, mo.group("base")
                    oldies.add(mo.group("base"))
    print "oldies now has {} for an increase of {}".format(len(oldies), len(oldies) - oldlen)
    try :
        # archive the processed page under its sha1 digest and compress it
        h = hashlib.sha1()
        h.update(open(newpage).read())
        root = h.hexdigest()
        os.rename(newpage, "save/" + root)
        if verbose :
            print "bzip2 save/{}".format(root)
        os.system("bzip2 save/{}".format(root))
    except Exception, e :
        print "Failed to move {} to save".format(newpage)
        print "exception was", e
    if len(oldies) < oldlen :
        print "Oops, something went wrong here."
        return
    with open("scraped", "w") as newf :
        for i in oldies :
            newf.write(i + "\n")
    print "scraped len went from {} to {} for an increase of {}".format(oldlen, len(oldies), len(oldies) - oldlen)

main()
"""
open("wrapper", "w").write(wrapper)
open("livescraper", "w").write(livescraper)
open("server", "w").write(server)