@spudtrooper
Created October 24, 2012 04:55
American Voices

# americanVoices.py -- scrape speaker records from The Onion's "American Voices" pages.
import os, re, sys, urllib2
from BeautifulSoup import BeautifulSoup

LAST_URL_FILE = 'lastUrl.txt'

def log(msg):
    print >> sys.stderr, msg

def saveLastUrl(url):
    # Remember the last article processed so the next run can resume from it.
    with open(LAST_URL_FILE, 'w') as f:
        f.write(url)

def getLastUrl():
    if not os.path.exists(LAST_URL_FILE):
        return None
    with open(LAST_URL_FILE, 'r') as f:
        return f.read()

class AmericanVoices:
    def main(self, url):
        self.loop(url)

    def loop(self, url):
        log(url)
        page = urllib2.urlopen(url)
        imgs = []
        soup = BeautifulSoup(page)
        # Collect the speaker images on this page.
        for div in soup('div', {'class': 'image'}):
            for img in div.findAll('img'):
                src = str(img['src'])
                if re.match(r'^http://', src):
                    imgs.append(src)
        # Collect the matching names and occupations.
        names = []
        jobs = []
        try:
            for p in soup('p', {'class': 'occupation'}):
                name, br, job = [str(x).strip() for x in p.contents]
                names.append(name)
                jobs.append(job)
            # Write out one pipe-delimited record per speaker.
            for img, name, job in zip(imgs, names, jobs):
                print '|'.join([img, name, job, url])
            sys.stdout.flush()
            saveLastUrl(url)
        except:
            pass
        # Follow the "previous" link to the next (older) article.
        for li in soup('li', {'class': 'previous'}):
            a = li.find('a')
            newUrl = 'http://www.theonion.com' + a['href']
            self.loop(newUrl)

def main(argv):
    prog = argv.pop(0)
    if len(argv) > 0:
        url = argv.pop(0)
    else:
        url = getLastUrl()
    if not url:
        url = 'http://www.theonion.com/articles/apple-unveils-ipad-mini,30068/'
    AmericanVoices().main(url)

if __name__ == '__main__':
    main(sys.argv)
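
The script above walks backwards through the article archive by following each page's "previous" link, printing one img|name|job|url record per speaker and saving the last article visited to lastUrl.txt so a later run can resume. It targets Python 2 with urllib2 and BeautifulSoup 3; as a rough guide for adapting it, here is a minimal Python 3 sketch of the same scraping step, assuming the requests and beautifulsoup4 packages and the same page structure (div.image, p.occupation, li.previous) as the original. The scrape_page helper is hypothetical, not part of the gist.

# Hypothetical Python 3 sketch of the scraping step in AmericanVoices.loop.
import re
import requests
from bs4 import BeautifulSoup

def scrape_page(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    # Speaker images on this page.
    imgs = []
    for div in soup.find_all('div', class_='image'):
        for img in div.find_all('img'):
            src = img['src']
            if re.match(r'^https?://', src):
                imgs.append(src)
    # Matching (name, occupation) pairs: each <p class="occupation">
    # holds the name, a <br/>, and the job.
    people = []
    for p in soup.find_all('p', class_='occupation'):
        name, _br, job = [str(x).strip() for x in p.contents]
        people.append((name, job))
    # Link to the next (older) article, if any.
    prev = soup.find('li', class_='previous')
    next_url = 'http://www.theonion.com' + prev.find('a')['href'] if prev else None
    return imgs, people, next_url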

# genHtml.py -- read the pipe-delimited records and emit an HTML page.
import fileinput, sys
from collections import OrderedDict

class Info:
    def __init__(self, name, job, url):
        self.name = name
        self.job = job
        self.url = url

def main(argv):
    prog = argv.pop(0)
    # Group the records by speaker image URL.
    imgs2infos = {}
    for line in fileinput.input():
        line = line.strip()
        img, name, job, url = line.split('|')
        info = Info(name, job, url)
        lst = imgs2infos.get(img)
        if not lst:
            lst = []
        lst.append(info)
        imgs2infos[img] = lst
    print '<html>'
    print '<head>'
    print '<style>'
    print '.m li {'
    print '  display: inline;'
    print '  float: left;'
    print '  padding: 10px;'
    print '}'
    print '</style>'
    print '</head>'
    print '<body>'
    print '<h1>The Onion - American Voices</h1>'
    print '<ul class="m" style="overflow:both; width:%dpx">' % (len(imgs2infos) * 300)
    # List the images with the most attributed quotes first.
    sortedImgs2Infos = OrderedDict(sorted(imgs2infos.items(),
                                          key=lambda x: -len(x[1])))
    for img, infos in sortedImgs2Infos.iteritems():
        print '<li>'
        print '<img src="%s"/>' % (img)
        print '<br/>'
        for info in infos:
            print '<a href="%s">%s</a> - %s' % (info.url, info.name, info.job)
            print '<br/>'
        print '</li>'
    print '</ul>'
    print '</body>'
    print '</html>'

if __name__ == '__main__':
    main(sys.argv)
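
genHtml.py reads those records from standard input (or from files named on its command line, via fileinput), groups them by image URL, and writes a single page that lists the images with the most attributed quotes first.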

# Makefile
all: americanVoices.html

# Scrape new records and append them to results.txt.
gather:
	python americanVoices.py >> results.txt

# De-duplicate the accumulated records and build the page.
americanVoices.html: results.txt
	sort $< | uniq | python genHtml.py > $@
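
With the three files in one directory, running make gather appends freshly scraped records to results.txt (re-running it resumes from lastUrl.txt), and a plain make de-duplicates the accumulated records with sort | uniq and pipes them through genHtml.py to produce americanVoices.html.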