Tuhonohono metadata harvester
import json
import os
import time
import urllib.request

import requests
from bs4 import BeautifulSoup


def load_text(fpath):
    with open(fpath, 'rt') as f:
        txt = f.read()
    print('Loaded "%s".' % fpath)
    return txt


class Harvester:
    def __init__(self):
        self.auth = None
        self.throttle = 1  # seconds to pause between requests
        self.harvest_name = 'UNSPECIFIED'
        self.setup()

    def setup(self):
        pass

    def get(self, url):
        time.sleep(self.throttle)  # be polite to the host
        r = requests.get(url, auth=self.auth)
        if r.status_code == 200:
            return r
        print('Error: %s.' % r.status_code)  # falls through, returning None


class TuhonohonoHarvester(Harvester):
    def setup(self):
        self.harvest_name = 'tuhonohono'
        self.report_dir = 'data'
        os.makedirs(self.report_dir, exist_ok=True)  # page snapshots land here

    def get_days(self):
        base_url = 'http://tuhonohono.wordpress.com/page'
        pages = range(1, 49)  # TODO - don't hand-code page ranges! (see the sketch below)
        for page in pages:
            url = '%s/%s' % (base_url, page)
            r = self.get(url)
            if r is None:  # non-200 response; get() already reported the error
                print('Skipped %s' % url)
                continue
            fpath = '%s/%02d.html' % (self.report_dir, page)
            with open(fpath, 'wb') as f:  # r.content is bytes
                f.write(r.content)
            print(fpath, 'written.')
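
    # A minimal sketch for the TODO above: walk /page/N upward until the
    # server stops answering 200, instead of hand-coding the page count.
    # Assumes WordPress returns an error status past the last archive page;
    # the method name is a hypothetical addition, untested against the live site.
    def get_days_autodetect(self):
        base_url = 'http://tuhonohono.wordpress.com/page'
        page = 1
        while True:
            r = self.get('%s/%s' % (base_url, page))
            if r is None:  # get() returns None on any non-200 status
                break
            fpath = '%s/%02d.html' % (self.report_dir, page)
            with open(fpath, 'wb') as f:
                f.write(r.content)
            print(fpath, 'written.')
            page += 1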
    def parse_pages(self):
        days = []
        for root, dirs, files in os.walk(self.report_dir):
            for fname in sorted(files):  # os.walk order is not guaranteed
                if not fname.endswith('.html'):
                    continue
                fpath = os.path.join(root, fname)
                html = load_text(fpath)
                soup = BeautifulSoup(html, 'html.parser')
                posts = soup.find_all('article', class_='post')
                for post in posts:
                    # Assumes each post has an <h1> title wrapping a permalink
                    # and carries at least two images.
                    day = {}
                    title = post.find('h1')
                    images = post.find_all('img')
                    day['title'] = title.text
                    day['url'] = title.a.get('href')
                    day['img1'] = images[0]['src']
                    day['img2'] = images[1]['src']
                    days.append(day)
        # Archive pages list posts newest-first; reverse so day 001 is the earliest.
        days.reverse()
        os.makedirs('img', exist_ok=True)  # downloaded images land here
        for i, day in enumerate(days):
            day['id'] = '%03d' % (i + 1)
            imgpath = 'img/%s.jpg' % day['id']
            if os.path.isfile(imgpath):  # already downloaded; skip
                continue
            urllib.request.urlretrieve(day['img1'], imgpath)
            print('GOT:', day['id'], day['img1'])
        with open('report.json', 'w') as f:
            json.dump(days, f, indent=2, sort_keys=True)
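

# Shape of each entry in report.json, as built above (illustrative values only):
# {"id": "001", "title": "...", "url": "http://tuhonohono.wordpress.com/...",
#  "img1": "http://.../a.jpg", "img2": "http://.../b.jpg"}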
def main():
    harvester = TuhonohonoHarvester()
    harvester.get_days()
    harvester.parse_pages()


if __name__ == '__main__':
    main()
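
# Usage sketch, assuming the script is saved as harvest.py (hypothetical name)
# and the two third-party dependencies are installed:
#   pip install requests beautifulsoup4
#   python harvest.py
# Outputs: data/NN.html page snapshots, img/NNN.jpg images, and report.json.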