A simple script to harvest Tuhonohono metadata and images. Beware of line 32. More: http://fogonwater.com/blog/2013/12/the-colours-of-tuhonohono/
Tuhonohono metadata harvester
import json, os, time, urllib

import requests
from bs4 import BeautifulSoup


def load_text(fpath):
    with open(fpath, 'rt') as f:
        txt = f.read()
    print 'Loaded "%s".' % fpath
    return txt


class Harvester:
    def __init__(self):
        self.auth = None
        self.throttle = 1  # seconds to wait between requests
        self.harvest_name = 'UNSPECIFIED'
        self.setup()

    def setup(self):
        # Hook for subclasses to override with site-specific settings.
        pass

    def get(self, url):
        time.sleep(self.throttle)  # be polite to the server
        r = requests.get(url, auth=self.auth)
        if r.status_code == 200:
            return r
        print 'Error: %s.' % r.status_code


class TuhonohonoHarvester(Harvester):
    def setup(self):
        self.harvest_name = 'tuhonohono'
        self.report_dir = 'data'  # NB: this directory must already exist

    def get_days(self):
        """Download each archive page of the blog into the report directory."""
        base_url = 'http://tuhonohono.wordpress.com/page'
        pages = range(1, 49)  # TODO - don't hardcode page ranges!!! :-(
        for page in pages:
            url = '%s/%s' % (base_url, page)
            r = self.get(url)
            try:
                soup = BeautifulSoup(r.content)  # r is None if the request failed
            except AttributeError:
                print 'AttributeError: Skipped %s' % url
                continue
            fpath = '%s/%02d.html' % (self.report_dir, page)
            with open(fpath, 'w') as f:
                f.write(r.content)
            print fpath, 'written.'

    def parse_pages(self):
        """Extract post metadata from the saved pages, fetch images, dump JSON."""
        days = []
        for root, dirs, files in os.walk(self.report_dir):
            for fname in files:
                if not fname.endswith('.html'):
                    continue
                fpath = os.path.join(root, fname)
                html = load_text(fpath)
                soup = BeautifulSoup(html)
                posts = soup.find_all('article', class_='post')
                for post in posts:
                    # Each post is assumed to have a title link and two images.
                    title = post.find('h1')
                    images = post.find_all('img')
                    day = {
                        'title': title.text,
                        'url': title.a.get('href'),
                        'img1': images[0]['src'],
                        'img2': images[1]['src'],
                    }
                    days.append(day)
        days.reverse()  # pages were harvested newest-first; restore chronological order
        for i, day in enumerate(days):
            day['id'] = '%03d' % (i + 1)
            imgurl = 'img/%s.jpg' % day['id']  # NB: the img directory must already exist
            if os.path.isfile(imgurl):
                continue  # image already downloaded on a previous run
            urllib.urlretrieve(day['img1'], imgurl)
            print 'GOT:', day['id'], day['img1']
        with open('report.json', 'w') as f:
            json.dump(days, f, indent=2, sort_keys=True)


def main():
    harvester = TuhonohonoHarvester()
    harvester.get_days()
    harvester.parse_pages()


if __name__ == '__main__':
    main()
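The Harvester base class uses setup() as a template-method hook: __init__ sets shared defaults (auth, throttle) and then calls setup(), which subclasses override with site-specific state before any request is made. A minimal sketch of reusing it for another site (the ExampleHarvester name and example.wordpress.com URL are illustrative, not part of the original gist):

class ExampleHarvester(Harvester):
    def setup(self):
        # Site-specific configuration runs before any request is made.
        self.harvest_name = 'example'
        self.report_dir = 'example_data'  # must exist before harvesting
        self.throttle = 2  # wait longer between requests for a stricter server

    def get_front_page(self):
        r = self.get('http://example.wordpress.com')
        if r is not None:
            print 'Fetched %s bytes.' % len(r.content)

The script targets Python 2 (print statements, urllib.urlretrieve) and needs requests and beautifulsoup4 installed, with the data and img directories created before it runs.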