Created
May 25, 2015 23:22
-
-
Save anonymous/7f9f4b2168a2c5fe5374 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Run this generator with Python 2 (the embedded scripts use Python-2-era
## libraries such as BeautifulSoup 3 and bottle).

# Shell wrapper: backs up the scraped URL list with git, then starts the
# bookmark web server and the live scraper in the background.
# NOTE: the backslash after the opening quotes keeps the shebang on the
# very first line of the generated file, otherwise the kernel ignores it.
# The script names below match the files written at the bottom of this
# generator ("server" and "livescraper").
wrapper = """\
#!/bin/bash
# change this to match your system and preferences
export TUMBLR_BASE_DIR=~/tumblr
cd ${TUMBLR_BASE_DIR}
# git is a cheap way to keep backups, so lets use it
# remove these two lines if you dont want them
git add scraped
git commit -m "scraped backup"
# run the webserver
python server &
# run the live scraper
python livescraper &
"""
# Source of the bookmark web server: "/" and "/random" both redirect to a
# random URL taken from the scraped file, reloading the list every 5th hit.
# Fixes over the original: the urlfile line had an unbalanced paren and
# misused os.path.join/os.getenv, "os" was never imported, "count % 5"
# reloaded on 4 of every 5 requests, and URLs kept their trailing newline.
server = """\
import os
import random

from bottle import run, route, redirect

# Path of the scraped-URL list: one URL per line, kept under
# TUMBLR_BASE_DIR (falls back to the current directory if unset).
urlfile = os.path.join(os.getenv("TUMBLR_BASE_DIR", "."), "scraped")

urllist = []
count = 0
verbose = True


@route("/")
def index():
    return random_url()


@route("/random")
def random_url():
    global count
    count += 1
    if count % 5 == 0:  # reload every 5th request to pick up new scrapes
        get_bookmarks()
    url = random.choice(urllist)
    print(url)
    redirect(url)


def get_bookmarks():
    # (Re)load the URL list; strip newlines so redirect() gets a clean URL.
    global urllist
    with open(urlfile) as bf:
        urllist = [line.strip() for line in bf]


if __name__ == "__main__":
    get_bookmarks()
    run(host='0.0.0.0', port=8080, debug=True)
"""
# Source of the live scraper: watches the bookmark directory for new .html
# files, harvests every tumblr blog URL out of them into the "scraped"
# file, then archives and bzip2s the page.
# Fixes over the original: "bookmarkdir" was a NameError (the variable is
# bookmark_dir), os.chdir(None) crashed when TUMBLR_BASE_DIR was unset,
# the backup rotation raised IOError on the first run, the observer was
# started before anything was scheduled, files were never closed, and the
# legacy "except Exception, e" syntax was used.
livescraper = """\
#!/usr/bin/python
import hashlib
import os
import re
import shutil

import BeautifulSoup
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

# Matches the scheme+host prefix of a tumblr URL, e.g. http://foo.tumblr.com/
tumblrpat = re.compile("(?P<base>http://[^/]*.tumblr.com/)")

verbose = True
oldies = None  # set of every tumblr URL seen so far, mirrored in "scraped"

# Work out of the bookmark directory; fall back to the current directory
# so a missing TUMBLR_BASE_DIR does not crash os.chdir().
bookmark_dir = os.getenv("TUMBLR_BASE_DIR", ".")
os.chdir(bookmark_dir)


class MyEventHandler(FileSystemEventHandler):
    # Scrapes every new .html file dropped into the watched directory.

    def __init__(self, observer, filename):
        self.observer = observer
        self.filename = filename

    def on_created(self, event):
        print("e= {}".format(event))
        if not event.is_directory:
            if event.src_path.endswith(".html"):
                print("got a new file {}".format(event.src_path))
                scrape(event.src_path)


def rotate_backups():
    # Keep the last few generations of the scraped file as cheap backups;
    # skip generations that do not exist yet (e.g. on the very first run).
    for src, dst in [("scraped.backup.2", "scraped.backup.3"),
                     ("scraped.backup.1", "scraped.backup.2"),
                     ("scraped.backup", "scraped.backup.1"),
                     ("scraped", "scraped.backup")]:
        if os.path.exists(src):
            shutil.copyfile(src, dst)


def main():
    global oldies
    rotate_backups()
    with open("scraped") as f:
        oldies = set(line.strip() for line in f)
    print(len(oldies))
    observer = Observer()
    event_handler = MyEventHandler(observer, bookmark_dir)
    # Schedule the watch before starting, then block until it stops.
    observer.schedule(event_handler, bookmark_dir, recursive=False)
    observer.start()
    observer.join()


def scrape(newpage):
    # Pull every tumblr blog URL out of newpage, merge them into oldies,
    # rewrite the scraped file, then archive and compress the page.
    if verbose:
        print("about to scrape {}".format(newpage))
    oldlen = len(oldies)
    if not os.path.exists(newpage):
        return  # the create event seems to fire more than once
    with open(newpage) as page:
        doc = BeautifulSoup.BeautifulSoup(page)
    for link in doc.findAll('a'):
        url = link.get('href')
        if verbose:
            print(url)
        if not url or 'avatar' in url or 'tumblr' not in url:
            continue
        if url.endswith('/'):  # skip images
            oldies.add(url)
        else:
            # but see if you can get a base tumblr url from it
            mo = tumblrpat.match(url)
            if mo:
                if verbose:
                    print("adding base url from image {} {}".format(url, mo.group("base")))
                oldies.add(mo.group("base"))
    print("oldies now has {} for an increase of {}".format(len(oldies), len(oldies) - oldlen))
    try:
        h = hashlib.sha1()
        with open(newpage) as f:
            h.update(f.read())
        root = h.hexdigest()
        os.rename(newpage, "save/" + root)
        if verbose:
            print("bzip2 save/{}".format(root))
        os.system("bzip2 save/{}".format(root))
    except Exception as e:
        print("Failed to move {} to save".format(newpage))
        print("exception was {}".format(e))
    if len(oldies) < oldlen:
        print("Oops, something went wrong here.")
        return
    with open("scraped", "w") as newf:
        for i in oldies:
            newf.write(i + "\\n")
    print("scraped len went from {} to {} for an increase of {}".format(oldlen, len(oldies), len(oldies) - oldlen))


if __name__ == "__main__":
    main()
"""
def _write_file(name, text):
    # Write one generated script to disk; the context manager guarantees
    # the handle is flushed and closed (the original relied on refcounting).
    with open(name, "w") as f:
        f.write(text)

_write_file("wrapper", wrapper)
_write_file("livescraper", livescraper)
_write_file("server", server)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment