oiva/books.py

## books.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import codecs
from datetime import datetime
import feedparser
import sys
import re

# Replace sys.stdout with a codecs UTF-8 stream writer wrapped around the
# original stream, so text printed by this script is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


def parse_books():
    """Collect book links from the show notes of every episode in the feed.

    Parses the RSS file ``./b2w.xml`` with feedparser and scans the HTML of
    each entry's first content element for two kinds of anchors:

    * links whose href starts with ``http://www.amazon.com``
    * non-Amazon links whose text starts with ``Book: `` or ``AudioBook: ``
      (matched case-insensitively)

    Links whose text contains any of the ``skip`` phrases (Amazon product
    categories and a few specific non-book products) are dropped.

    Returns:
        A list of ``(url, link_text, episode_link, episode_title)`` tuples
        in feed order, each distinct tuple appearing only once.
    """
    # The feed was originally fetched from http://feeds.5by5.tv/b2w
    filename = './b2w.xml'

    # Raw strings keep the regex escapes literal instead of relying on
    # Python passing unknown string escapes through unchanged.
    amazon_re = re.compile(
        r'<a .*? href="(http:\/\/www\.amazon\.com[^"]*).*?>(.*?)</a>',
        re.IGNORECASE)
    other_re = re.compile(
        r'<a .*? href="((?!http:\/\/www\.amazon).*?)" .*?>'
        r'((?:Audio)?Book: .*?)</a>',
        re.IGNORECASE)

    # NOTE(review): 'Crafts & Sewing' is the only entry with an unescaped
    # ampersand -- confirm whether '&amp;' was intended there.
    skip = ['Health &amp; Personal Care', 'Toys &amp; Games', 'MP3 Downloads',
            'Computers &amp; Accessories', 'Musical Instruments', 'Moleskine',
            'Everything Else', 'Music', 'Electronics', 'Movies &amp; TV',
            'Sports &amp; Outdoors', 'Grocery &amp; Gourmet Food', ': Baby',
            'Amazon Instant Video', 'Kitchen &amp; Dining', 'Floor Lamp',
            'Wishlist', 'The Aviator', 'Home Improvement', 'Video Games',
            'Fingernail Clipper', 'Edimax N150 Wireless', 'Automotive',
            'ASUS Dual-Band Wireless', 'Camera &amp; Photo', 'Beauty',
            'Office Products', 'Crafts & Sewing', 'Patio, Lawn &amp; Garden',
            'Home &amp; Kitchen', 'Brass No Soliciting Sign', 'Light Bulb',
            'Ultra Pro Resealable Current Size Comic Bags', 'Model Rocket Kit',
            'Arts, Crafts &amp; Sewing']

    books = []
    seen = set()  # constant-time duplicate check; `books` keeps the order

    feed = feedparser.parse(filename)

    for episode in feed.entries:
        # Entries without any content element carry no show notes to scan;
        # skip them instead of failing on the missing attribute.
        content = episode.get('content')
        if not content:
            continue
        notes = content[0].value

        # Amazon links first, then the explicitly labelled non-Amazon ones.
        for url, text in amazon_re.findall(notes) + other_re.findall(notes):
            # skip items that are not really books
            if any(category in text for category in skip):
                continue

            # include episode info
            book = (url, text, episode.link, episode.title)

            # no duplicates
            if book not in seen:
                seen.add(book)
                books.append(book)
    return books


def produce_list(books):
    """Render the collected books into ``./index.html``.

    Each book title is cleaned of Amazon boilerplate, classified as a comic
    or a regular book by keyword, and split into title and author with
    get_author(). The resulting table rows and counters are substituted into
    the template ``./index.tmpl`` by plain string replacement and the result
    is written next to it with the ``tmpl`` part of the name replaced by
    ``html``. Titles and links are inserted as-is, without HTML escaping.

    Args:
        books: iterable of ``(link, title, episode_link, episode_title)``
            tuples.

    Returns:
        ``(bookcount, comiccount, authorcount)``, or ``None`` (after printing
        a message) when the template turns out to be empty.
    """
    filename = './index.tmpl'
    filtered = [';Book: ', 'BOOK: ', ': Books', ':Books', '[Amazon]',
                'Amazon.com: Boo', 'Amazon: ', 'Amazon.com: ', ': Amazon.com',
                ' at Amazon.com', ':Amazon', '(Amazon.com)', '(Amazon)',
                ': Explore similar items', ' - Amazon.com', 'Kindle Store',
                ' (HIGHLY recommended by Merlin)']
    comics = ['Marvel Famous Firsts', 'Marvel Now', 'Thor:', 'X-Men',
              'Hawkeye', 'American Vampire', 'The Walking Dead', 'Watchmen',
              'X-Force', 'Daredevil', 'Spider-Man', 'Scarlet', 'She-Hulk',
              'Fantastic Four', 'Wolverine', 'Fiona Staples', 'Civil War',
              'Brian Michael Bendis', 'Marvels', 'Batman', 'Deadpool',
              'Avengers', 'Animal Man', 'Transmetropolitan', 'Volume 1',
              'Y: The Last Man', 'Zita the Spacegirl', 'Invincible:',
              '5 Ronin', 'Runaways', 'The Immortal Iron Fist', 'Superman',
              'The Wonderful Wizard of Oz', 'World War Hulk',
              'Incredible Hulk', 'Infinity Gauntlet', 'Punk Rock Jesus']

    bookrows = []
    comicrows = []
    authorcount = gtd = 0

    for (link, title, episode_link, episode_title) in books:
        # filter out "Amazon.com" and similar things from the title
        for word in filtered:
            title = title.replace(word, '')
        title = re.sub(r'^(Audiobook|Book):', '', title)
        title = title.replace('&#x27;', '\'')

        # One link in episode 72 is labelled just "Amazon"; substitute the
        # real title. Adjacent literals are used so that no source
        # indentation ends up inside the string.
        if title == 'Amazon':
            title = ('The Now Habit: A Strategic Program for Overcoming '
                     'Procrastination and Enjoying Guilt-Free Play: '
                     'Neil Fiore')

        # Classify on the full title, before the author is split off: some
        # of the keywords are creator names.
        iscomic = any(comic in title for comic in comics)

        # try to guess author from title
        (author, title) = get_author(title)

        # Episode number for sorting: the text before the first colon, or
        # the whole episode title when it has no colon.
        episode = episode_title.partition(':')[0]

        row = ('\t\t\t<tr>\n'
               '\t\t\t\t<td><a href="%s">%s</a></td>\n'
               '\t\t\t\t<td>%s</td>\n'
               '\t\t\t\t<td data-value="%s"><a href="%s">%s</a></td>\n'
               '\t\t\t</tr>\n'
               % (link, title, author, episode, episode_link, episode_title))

        if iscomic:
            comicrows.append(row)
        else:
            bookrows.append(row)
        if author != '':
            authorcount += 1
        if 'Getting Things Done' in title:
            gtd += 1

    bookcount = len(bookrows)
    comiccount = len(comicrows)

    with codecs.open(filename, 'r', 'utf-8') as template:
        html = template.read()

    # Check the template itself rather than the substituted result.
    if not html:
        print('reading template from %s failed' % filename)
        return None

    html = html.replace('{tablebody}', ''.join(bookrows))
    html = html.replace('{bookcount}', str(bookcount))
    html = html.replace('{comicbody}', ''.join(comicrows))
    html = html.replace('{comiccount}', str(comiccount))
    html = html.replace('{date}', "{:%Y-%m-%d}".format(datetime.now()))
    html = html.replace('{gtd}', str(gtd))

    with codecs.open(filename.replace('tmpl', 'html'), 'w', 'utf-8') as out:
        out.write(html)

    return (bookcount, comiccount, authorcount)


def get_author(title):
    """Split a link title into ``(author, title)`` using simple heuristics.

    Runs of six or more digits (optionally parenthesised, e.g. an ISBN) are
    removed first. The text is then split on one separator, chosen in this
    order:

    1. ``' - '`` when the title has as many colons as ``' - '`` separators
       (and at least one colon);
    2. ``':'`` when it contains a colon (blank pieces are discarded);
    3. ``' by '``;
    4. ``' - '`` otherwise.

    The last piece is taken as the author candidate when at least one other
    piece precedes it. The candidate is rejected when it looks like a
    subtitle: more than three words and either no comma at all or more words
    than ``(commas + 1) * 2 + 1``. Middle initials and the suffix ``MSPT``
    are ignored for the word count.

    Args:
        title: the cleaned link text.

    Returns:
        A tuple ``(author, title)``. ``author`` is ``''`` when no plausible
        author was found, in which case ``title`` is the whole input (minus
        digits) stripped of surrounding spaces and colons.
    """
    # remove (9123912392925) from title
    title = re.sub(r'\(?[0-9]{6,}\)?', '', title)

    # split title into title and author. Usually separated by ':'.
    colons = title.count(':')
    if colons > 0 and colons == title.count(' - '):
        parts = title.split(' - ')
    elif colons > 0:
        # A real list (not a lazy filter object) so it can be popped, with
        # blank pieces removed.
        parts = [piece for piece in title.split(':') if piece.strip()]
    elif ' by ' in title:
        parts = title.split(' by ')
    else:
        parts = title.split(' - ')

    author = ''

    # An author needs at least one title piece in front of it. This also
    # covers the case where every piece was blank and `parts` is empty.
    if len(parts) > 1:
        candidate = parts.pop().strip(': ')
        candidate = candidate.replace(' and ', ', ').replace(',,', ',')

        # remove middle name initials for easier heuristics about name
        simple = re.sub(r'\s[A-Z]\.', '', candidate)
        simple = simple.replace(' MSPT', '').strip()

        words = len(simple.split(' '))
        commas = simple.count(',')

        # example: Andrew Hunt, David Thomas -> 4 names <= (1 comma + 1) * 2
        looks_like_subtitle = words > 3 and (
            commas == 0 or words > (commas + 1) * 2 + 1)

        if candidate != '' and not looks_like_subtitle:
            author = candidate
            title = ': '.join(parts)

    # cleanup
    title = title.strip(' :')
    # make sure every comma in the author list is followed by a space
    author = re.sub(r',([^\s])', r', \1', author)

    return (author, title)

# Script body: parse the feed, render the page and report the totals.
books = parse_books()
count = produce_list(books)
# produce_list() returns None (after printing its own message) when the
# template is empty; formatting None with three %d fields would raise.
if count is not None:
    print("found %d books and %d comics. %d authors found." % count)

## index.tmpl
<!DOCTYPE html>
<html>
<head>
    <!-- Template for the generated reading list. books.py fills in the
         brace-delimited placeholders (table rows, counts, date) by plain
         string replacement and writes the result to index.html. -->
    <meta charset="utf-8">
    <title>Back to Work Reading List</title>

    <meta name="viewport" content="width=device-width, initial-scale=1">

    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/sortable/0.6.0/css/sortable-theme-bootstrap.css">
    <style type="text/css" media="screen">
        footer {
            border-top: 1px solid #e0e0e0;
            margin-top: 30px;
            padding-top: 10px;
        }
        .table-hover>tbody>tr:hover {
            background-color: #f0f6ff;
        }
    </style>
</head>
<body>
    <div class="container-fluid">

        <div class="page-header row">
            <div class="col-xs-12 col-md-6">
                <h1>Back to Work Reading List</h1>
                <p><a href="http://5by5.tv/b2w">Back to Work</a> is a podcast about <a href="#comics">comics</a>
                    and <a href="#books">productivity books</a>.
                    This is a list of the show's recommended reading, as gathered from show notes.
                </p>
                <p>Some errors and omissions are possible because the list is in fact compiled from RSS by a robot.
                   Most of the links go to amazon.com and those are 5by5's affiliate links.
                </p>
            </div>
        </div>

        <div class="books">
            <a name="books"></a>
            <h2>Books</h2>
            <p>A total of {bookcount} books have been recommended so far. {gtd} of them are
                <a href="http://www.amazon.com/exec/obidos/ASIN/0142000280/5by5-20">Getting Things Done</a>,
                ©DavidCo 2001.</p>

                <table class="table table-hover table-condensed sortable-theme-bootstrap" data-sortable>
                    <thead>
                        <tr>
                            <th>Title</th>
                            <th>Author</th>
                            <th>Episode</th>
                        </tr>
                    </thead>
                    <tbody>
            {tablebody}
                    </tbody>
                </table>

        </div>

        <div class="comics">
            <a name="comics"></a>
            <h2>Comics</h2>
            <p>A total of {comiccount} comics have been recommended.</p>

                <table class="table table-hover table-condensed sortable-theme-bootstrap" data-sortable>
                    <thead>
                        <tr>
                            <th>Title</th>
                            <th>Author</th>
                            <th>Episode</th>
                        </tr>
                    </thead>
                    <tbody>
            {comicbody}
                    </tbody>
                </table>

        </div>

        <footer class="footer">
            <p>List created by <a href="https://twitter.com/oiva">Oiva</a>. Updated on {date}.
                Lovingly created with <a href="https://gist.github.com/oiva/400a383e8f271c84d4a5">Python</a>.</p>
        </footer>
    </div>

    <!-- Loaded over HTTPS like the stylesheets, so the script is not blocked
         as mixed content when the page itself is served over HTTPS. -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/sortable/0.6.0/js/sortable.min.js"></script>
</body>
</html>
	#!/usr/bin/python
	# -*- coding: UTF-8 -*-

	import codecs
	from datetime import datetime
	import feedparser
	import sys
	import re

	sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


	def parse_books():
	# url = 'http://feeds.5by5.tv/b2w'
	filename = './b2w.xml'
	pattern = '<a .*? href="(http:\/\/www\.amazon\.com[^"]*).*?>(.*?)</a>'
	pattern2 = '<a .*? href="((?!http:\/\/www\.amazon).*?)" .*?>((?:Audio)?\
	Book: .*?)</a>'
	skip = ['Health &amp; Personal Care', 'Toys &amp; Games', 'MP3 Downloads',
	'Computers &amp; Accessories', 'Musical Instruments', 'Moleskine',
	'Everything Else', 'Music', 'Electronics', 'Movies &amp; TV',
	'Sports &amp; Outdoors', 'Grocery &amp; Gourmet Food', ': Baby',
	'Amazon Instant Video', 'Kitchen &amp; Dining', 'Floor Lamp',
	'Wishlist', 'The Aviator', 'Home Improvement', 'Video Games',
	'Fingernail Clipper', 'Edimax N150 Wireless', 'Automotive',
	'ASUS Dual-Band Wireless', 'Camera &amp; Photo', 'Beauty',
	'Office Products', 'Crafts & Sewing', 'Patio, Lawn &amp; Garden',
	'Home &amp; Kitchen', 'Brass No Soliciting Sign', 'Light Bulb',
	'Ultra Pro Resealable Current Size Comic Bags', 'Model Rocket Kit',
	'Arts, Crafts &amp; Sewing']
	regex = re.compile(pattern, re.IGNORECASE)
	regex2 = re.compile(pattern2, re.IGNORECASE)
	books = []

	feed = feedparser.parse(filename)

	for episode in feed.entries:
	links = regex.findall(episode.content[0].value)
	non_amazon = regex2.findall(episode.content[0].value)
	links += non_amazon

	for link in links:
	# skip items that are not really books
	is_book = True
	for category in skip:
	if category in link[1]:
	is_book = False
	if not is_book:
	continue

	# include episode info
	link += (episode.link, episode.title)

	# no duplicates
	if link not in books:
	books.append(link)
	return books


	def produce_list(books):
	filename = './index.tmpl'
	filtered = [';Book: ', 'BOOK: ', ': Books', ':Books', '[Amazon]',
	'Amazon.com: Boo', 'Amazon: ', 'Amazon.com: ', ': Amazon.com',
	' at Amazon.com', ':Amazon', '(Amazon.com)', '(Amazon)',
	': Explore similar items', ' - Amazon.com', 'Kindle Store',
	' (HIGHLY recommended by Merlin)']
	comics = ['Marvel Famous Firsts', 'Marvel Now', 'Thor:', 'X-Men',
	'Hawkeye', 'American Vampire', 'The Walking Dead', 'Watchmen',
	'X-Force', 'Daredevil', 'Spider-Man', 'Scarlet', 'She-Hulk',
	'Fantastic Four', 'Wolverine', 'Fiona Staples', 'Civil War',
	'Brian Michael Bendis', 'Marvels', 'Batman', 'Deadpool',
	'Avengers', 'Animal Man', 'Transmetropolitan', 'Volume 1',
	'Y: The Last Man', 'Zita the Spacegirl', 'Invincible:',
	'5 Ronin', 'Runaways', 'The Immortal Iron Fist', 'Superman',
	'The Wonderful Wizard of Oz', 'World War Hulk',
	'Incredible Hulk', 'Infinity Gauntlet', 'Punk Rock Jesus']

	booklist = comiclist = ''
	bookcount = comiccount = authorcount = gtd = 0

	for book in books:
	(link, title, episodeLink, episodeTitle) = book

	# filter out "Amazon.com" and similar things from the title
	for word in filtered:
	title = title.replace(word, '')
	title = re.sub(r'^(Audiobook|Book):', '', title)
	title = title.replace('&#x27;', '\'')

	# one stupid link in ep 72
	if title == 'Amazon':
	title = 'The Now Habit: A Strategic Program for Overcoming\
	Procrastination and Enjoying Guilt-Free Play: \
	Neil Fiore'

	iscomic = False
	for comic in comics:
	if comic in title:
	iscomic = True
	break

	# try to guess author from title
	(author, title) = get_author(title)

	# parse episode number for sorting
	episode = episodeTitle[:episodeTitle.find(':')]

	row = '\t\t\t<tr>\n\t\t\t\t<td><a href="%s">%s</a></td>\n\
	<td>%s</td>\
	<td data-value="%s"><a href="%s">%s</a></td>\n\
	\t\t\t</tr>\n'\
	% (link, title, author, episode, episodeLink, episodeTitle)

	if iscomic:
	comiclist += row
	comiccount += 1
	else:
	booklist += row
	bookcount += 1
	if author != '':
	authorcount += 1
	if 'Getting Things Done' in title:
	gtd += 1

	with codecs.open(filename, 'r', 'utf-8') as template:
	html = template.read()
	html = html.replace('{tablebody}', booklist)
	html = html.replace('{bookcount}', str(bookcount))
	html = html.replace('{comicbody}', comiclist)
	html = html.replace('{comiccount}', str(comiccount))
	html = html.replace('{date}', "{:%Y-%m-%d}".format(datetime.now()))
	html = html.replace('{gtd}', str(gtd))

	if not html:
	print 'reading template from %s failed' % filename
	return

	with codecs.open(filename.replace('tmpl', 'html'), 'w', 'utf-8') as file:
	file.write(html)

	return (bookcount, comiccount, authorcount)


	def get_author(title):
	# remove (9123912392925) from title
	title = re.sub(r'\(?[0-9]{6,}\)?', '', title)

	# split title into title and author. Usually separated by ':'.
	if title.count(':') == title.count(' - ') and title.count(':') > 0:
	parts = title.split(' - ')
	elif ':' in title:
	parts = title.split(':')
	# remove empty parts
	parts = filter(lambda title: title.strip(), parts)
	elif ' by ' in title:
	parts = title.split(' by ')
	else:
	parts = title.split(' - ')

	# take last part of title
	part = parts.pop()

	if len(parts) >= 1:
	# try to guess if string is either a list of authors or just part of
	# the title
	author = part.strip(': ')
	author = author.replace(' and ', ', ').replace(',,', ',')

	# remove middle name initials for easier heuristics about name
	simpleauthor = re.sub(r'\s[A-Z]\.', '', author)
	simpleauthor = simpleauthor.replace(' MSPT', '').strip()

	authorparts = len(simpleauthor.split(' '))
	commas = simpleauthor.count(',')

	# example: Andrew Hunt, David Thomas -> 4 names <= (1 comma + 1) * 2
	if authorparts > 3 and authorparts > (commas + 1) * 2 + 1:
	author = ''
	# more than three words without commas
	elif authorparts > 3 and commas == 0:
	author = ''

	if author != '':
	title = ': '.join(parts)
	else:
	author = ''

	# cleanup
	title = title.strip(' :')
	author = re.sub(r',([^\s])', r', \1', author)

	return (author, title)

	books = parse_books()
	count = produce_list(books)
	print "found %d books and %d comics. %d authors found." % count
	<!DOCTYPE html>
	<html>
	<head>
	<title>Back to Work Reading List</title>

	<meta name="viewport" content="width=device-width, initial-scale=1">

	<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css">
	<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/sortable/0.6.0/css/sortable-theme-bootstrap.css">
	<style type="text/css" media="screen">
	footer {
	border-top: 1px solid #e0e0e0;
	margin-top: 30px;
	padding-top: 10px;
	}
	.table-hover>tbody>tr:hover {
	background-color: #f0f6ff;
	}
	</style>
	</head>
	<body>
	<div class="container-fluid">

	<div class="page-header row">
	<div class="col-xs-12 col-md-6">
	<h1>Back to Work Reading List</h1>
	<p><a href="http://5by5.tv/b2w">Back to Work</a> is a podcast about <a href="#comics">comics</a>
	and <a href="#books">productivity books</a>.
	This is a list of the show's recommended reading, as gathered from show notes.
	</p>
	<p>Some errors and omissions are possible because the list is in fact compiled from RSS by a robot.
	Most of the links go to amazon.com and those are 5by5's affiliate links.
	</p>
	</div>
	</div>

	<div class="books">
	<a name="books"></a>
	<h2>Books</h2>
	<p>A total of {bookcount} books have been recommended so far. {gtd} of them are
	<a href="http://www.amazon.com/exec/obidos/ASIN/0142000280/5by5-20">Getting Things Done</a>,
	©DavidCo 2001.</p>

	<table class="table table-hover table-condensed sortable-theme-bootstrap" data-sortable>
	<thead>
	<tr>
	<th>Title</th>
	<th>Author</th>
	<th>Episode</th>
	</tr>
	</thead>
	<tbody>
	{tablebody}
	</tbody>
	</table>

	</div>

	<div class="comics">
	<a name="comics"></a>
	<h2>Comics</h2>
	<p>A total of {comiccount} comics have been recommended.</p>

	<table class="table table-hover table-condensed sortable-theme-bootstrap" data-sortable>
	<thead>
	<tr>
	<th>Title</th>
	<th>Author</th>
	<th>Episode</th>
	</tr>
	</thead>
	<tbody>
	{comicbody}
	</tbody>
	</table>

	</div>

	<footer class="footer">
	<p>List created by <a href="https://twitter.com/oiva">Oiva</a>. Updated on {date}.
	Lovingly created with <a href="https://gist.github.com/oiva/400a383e8f271c84d4a5">Python</a>.</p>
	</footer>
	</div>

	<script src="http://cdnjs.cloudflare.com/ajax/libs/sortable/0.6.0/js/sortable.min.js"></script>
	</body>
	</html>