Created
October 24, 2012 04:55
-
-
Save spudtrooper/3943983 to your computer and use it in GitHub Desktop.
American Voices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, re, string, sys, urllib2 | |
from BeautifulSoup import BeautifulSoup | |
LAST_URL_FILE = 'lastUrl.txt' | |
def log(msg):
    """Write ``msg`` followed by a newline to standard error."""
    sys.stderr.write('%s\n' % (msg,))
def saveLastUrl(url):
    """Store ``url`` in LAST_URL_FILE, overwriting any previous contents."""
    with open(LAST_URL_FILE, 'w') as out:
        out.write(url)
def getLastUrl():
    """Return the URL stored in LAST_URL_FILE, or None if there is none.

    Returns:
        The file's contents with surrounding whitespace removed, or None
        when the file does not exist or contains only whitespace.
    """
    # Named `path` rather than `file` so the Python 2 builtin is not shadowed.
    path = LAST_URL_FILE
    if not os.path.exists(path):
        return None
    with open(path, 'r') as f:
        # Strip so that a trailing newline (e.g. from a hand-edited file)
        # does not become part of the URL handed back to the caller.
        url = f.read().strip()
    return url or None
class AmericanVoices:
    """Scrape a chain of pages, printing one record per person found.

    Each record is written to standard output as ``img|name|job|url``.
    After a page has been handled, the scraper moves on to the page linked
    from its ``<li class="previous">`` element.
    """

    def main(self, url):
        """Scrape starting from ``url``; see loop()."""
        self.loop(url)

    def loop(self, url):
        """Scrape ``url`` and then every page reached via "previous" links.

        Pages are visited in a loop instead of by recursing once per page,
        so a long chain of pages cannot exhaust the interpreter's recursion
        limit.  The walk ends when a page has no usable "previous" link or
        when it leads back to a URL already visited in this run.

        Args:
            url: Address of the first page to fetch.
        """
        seen = set()
        while url and url not in seen:
            seen.add(url)
            url = self._scrapePage(url)

    def _scrapePage(self, url):
        """Fetch one page, print its records, and return the next URL.

        Returns:
            The URL of the "previous" page, or None if the page has none.
        """
        log(url)
        page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Release the connection even if parsing fails.
            page.close()

        imgs = self._imageUrls(soup)
        try:
            names, jobs = self._namesAndJobs(soup)
            # Write out the values
            for img, name, job in zip(imgs, names, jobs):
                sys.stdout.write('|'.join([img, name, job, url]) + '\n')
            sys.stdout.flush()
            saveLastUrl(url)
        except Exception as e:
            # Best effort: a page that cannot be parsed or written is
            # skipped, but the reason is reported instead of being dropped.
            log('skipping %s: %r' % (url, e))

        return self._previousUrl(soup)

    def _imageUrls(self, soup):
        """Return the absolute http:// image sources inside div.image."""
        imgs = []
        for div in soup('div', {'class': 'image'}):
            for img in div.findAll('img'):
                src = img.get('src')
                if not src:
                    # An <img> without a src cannot contribute a record.
                    continue
                src = str(src)
                if src.startswith('http://'):
                    imgs.append(src)
        return imgs

    def _namesAndJobs(self, soup):
        """Return parallel lists of names and jobs from p.occupation.

        Raises:
            ValueError: if a paragraph does not consist of exactly three
                nodes (name, separator, job).
        """
        names = []
        jobs = []
        for p in soup('p', {'class': 'occupation'}):
            # The middle node is the separator between name and job.
            name, br, job = [str(x).strip() for x in p.contents]
            names.append(name)
            jobs.append(job)
        return names, jobs

    def _previousUrl(self, soup):
        """Return the absolute URL of the first "previous" link, or None."""
        # Find the next link
        for li in soup('li', {'class': 'previous'}):
            a = li.find('a')
            if a is None or not a.get('href'):
                continue
            # NOTE(review): attribute values presumably come back as
            # unicode; coerce to str, as is done for image sources, so the
            # joined output line is built from a single string type.
            return 'http://www.theonion.com' + str(a['href'])
        return None
def main(argv):
    """Pick a starting URL and run the scraper.

    The starting URL is, in order of preference: the first command-line
    argument, the URL returned by getLastUrl(), or a built-in default.

    Args:
        argv: Full argument vector with the program name first (normally
            sys.argv).  It is read but not modified, and may be empty.
    """
    args = argv[1:]
    url = args[0] if args else getLastUrl()
    if not url:
        url = 'http://www.theonion.com/articles/apple-unveils-ipad-mini,30068/'
    AmericanVoices().main(url)


if __name__ == '__main__':
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fileinput, sys | |
from collections import OrderedDict | |
class Info:
    """Plain record holding a name, a job and a URL."""

    def __init__(self, name, job, url):
        self.name, self.job, self.url = name, job, url
def main(argv):
    """Read ``img|name|job|url`` records and print an HTML page of them.

    Records are grouped by image.  Images are listed in order of how many
    records share them, most-used first, each followed by links for its
    records.  The page is written to standard output.

    Args:
        argv: Full argument vector with the program name first (normally
            sys.argv).  Records are read from the files named in
            ``argv[1:]``, or from standard input when there are none.
            The list is not modified.
    """
    # The file names are passed explicitly.  With no argument,
    # fileinput.input() reads sys.argv[1:]; popping the program name off
    # sys.argv beforehand therefore made it skip the first file named.
    imgs2infos = {}
    for line in fileinput.input(argv[1:]):
        line = line.strip()
        if not line:
            continue
        fields = line.split('|')
        if len(fields) != 4:
            # Report and skip rather than abort the whole page on one bad
            # record.
            sys.stderr.write('skipping malformed line: %s\n' % line)
            continue
        img, name, job, url = fields
        imgs2infos.setdefault(img, []).append(Info(name, job, url))

    out = [
        '<html>',
        '<head>',
        '<style>',
        '.m li {',
        ' display: inline;',
        ' float: left;',
        ' padding: 10px',
        '}',
        '</style>',
        '</head>',
        '<body>',
        '<h1>The Onion - American Voices</h1>',
        # NOTE(review): "both" is not a valid CSS overflow value; left as is
        # because the intended value is not clear from this file.
        '<ul class="m" style="overflow:both; width:%dpx">'
        % (len(imgs2infos) * 300),
    ]

    # Most-used images first; sorted() already yields the final order.
    mostUsedFirst = sorted(imgs2infos.items(), key=lambda item: -len(item[1]))
    for img, infos in mostUsedFirst:
        out.append('<li>')
        out.append('<img src="%s"/>' % img)
        out.append('<br/>')
        for info in infos:
            out.append('<a href="%s">%s</a> - %s'
                       % (info.url, info.name, info.job))
            out.append('<br/>')
        out.append('</li>')

    out.extend(['</ul>', '</body>', '</html>'])
    sys.stdout.write('\n'.join(out) + '\n')


if __name__ == '__main__':
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# `all` and `gather` name actions, not files; declaring them phony keeps a
# stray file of the same name from making them look up to date.
.PHONY: all gather

# Default target: build the HTML page.
all: americanVoices.html

# Run the scraper, appending its records to results.txt.
gather:
	python americanVoices.py >> results.txt

# Rebuild the page from the de-duplicated records.  genHtml.py is listed
# after results.txt so that $< still expands to results.txt, while edits to
# the generator also trigger a rebuild.
americanVoices.html: results.txt genHtml.py
	sort $< | uniq | python genHtml.py > $@
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment