# ergoithz/humbledup.py

## humbledup.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import lxml, lxml.etree as etree
import collections
import urllib2
import pprint
import logging
import cPickle

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names of the bundles to check against each other. The literal holds one
# name per line; each line is stripped of its surrounding whitespace.
owned = map(str.strip, '''
    Humble Bundle for Android 2
    Humble Bundle for Android 4
    Humble Bundle with Android 5
    Humble Bundle: PC and Android 7
    Humble Bundle: PC and Android 8
    Humble Bundle: PC and Android 9
    Humble Cartoon Network Mobile Bundle
    Humble Crescent Moon Mobile Bundle
    Humble Double Fine Bundle
    Humble Indie Bundle 13
    Humble Indie Bundle 14
    Humble Indie Bundle 6
    Humble Indie Bundle 7
    Humble Indie Bundle 9
    Humble Indie Bundle X
    Humble Mo-boo!-ile Bundle
    Humble Mobile Bundle
    Humble Mobile Bundle 10
    Humble Mobile Bundle 2
    Humble Mobile Bundle 4
    Humble Mobile Bundle 5
    Humble Mobile Bundle 7
    Humble Mobile Bundle 9
    Humble PC & Android Bundle 12
    Humble SEGA Mobile Bundle
    Humble Weekly Sale: Egosoft
    Metro 2033
    Tropico 3
    '''.strip().splitlines())

# Unbound dict iteration methods (Python 2 dict API), so code can write
# iteritems(d) / itervalues(d).
iteritems = dict.iteritems
itervalues = dict.itervalues
# Single lxml HTML parser instance, passed to etree.XML() by get_etree().
parser = etree.HTMLParser()
# Raw response bodies keyed by URL; filled in by get_etree().
urlcache = {}

def get_string(elm):
    """Return *elm* serialized with ``method='text'`` as stripped unicode."""
    raw = etree.tostring(elm, method='text', encoding='UTF-8')
    # Strip the UTF-8 byte string first, then decode it.
    return unicode(raw.strip(), 'utf-8')

def get_xml(elm):
    """Return *elm* serialized with ``method='xml'`` as stripped unicode."""
    raw = etree.tostring(elm, method='xml', encoding='UTF-8')
    # Strip the UTF-8 byte string first, then decode it.
    return unicode(raw.strip(), 'utf-8')

def get_etree(url):
    """Fetch *url* (at most once) and parse it with the shared HTML parser.

    The raw response body is stored in the module-level ``urlcache`` keyed
    by URL, so a second call for the same URL does not open it again. The
    cached bytes are parsed anew on every call, so each call returns a
    separate tree.

    Returns whatever ``etree.XML(body, parser)`` produces for the body.
    Errors raised while opening or reading the URL propagate to the caller
    and leave ``urlcache`` without an entry for *url*.
    """
    if url not in urlcache:
        response = urllib2.urlopen(url)
        try:
            urlcache[url] = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
    return etree.XML(urlcache[url], parser)

def get_games_wikipedia(bundlelist):
    """Map game titles to bundle names using Wikipedia's list of bundles.

    Scans every row of every ``table.wikitable`` on the page:

    * a row with 8 ``<td>`` cells sets the current bundle (text of cell 1)
      and takes its game title from the first ``<i>`` in cell 3;
    * a row with 2 or 3 cells takes its title from the first ``<i>`` in
      cell 1 and is filed under the most recently seen bundle
      (presumably continuation rows of a row-spanning bundle cell --
      verify against the live page);
    * any other row is ignored.

    Rows without an ``<i>`` title, and short rows that appear before any
    bundle row, are skipped.

    bundlelist: accepted but not used by this function.
    Returns a plain dict mapping game title -> list of bundle names.
    """
    games = collections.defaultdict(list)
    url = 'http://en.wikipedia.org/wiki/List_of_Humble_Bundles'
    tree = get_etree(url)
    # No bundle is known until the first 8-cell row has been seen.
    bundle = None
    for tr in tree.xpath('//table[@class="wikitable"]/tr'):
        tds = tr.xpath('./td')
        if len(tds) == 8:
            # Remember the bundle even if this row turns out to have no
            # title, so that following short rows are still attributed.
            bundle = get_string(tds[0])
            titles = tds[2].xpath('./i')
        elif 1 < len(tds) < 4:
            titles = tds[0].xpath('./i')
        else:
            continue
        if bundle is None or not titles:
            continue
        games[get_string(titles[0])].append(bundle)
    return dict(games)

def get_games_epicbundle(bundlelist):
    """Collect the games of the requested bundles from epicbundle.com.

    Walks the Humble Bundle listing pages. For every listed bundle whose
    link text (after removing the link's ``<span>`` children) is in
    *bundlelist*, the linked bundle page is fetched and the text of each
    ``<h2>`` directly under a ``div.bundleHeadline`` is recorded as a game
    of that bundle.

    The number of listing pages is read from the pager text, expected to
    look like ``Page 1 of N``. If there is no pager, only the two initial
    URLs are scanned; if the pager text cannot be parsed a warning is
    logged and the scan is likewise limited to the initial URLs.

    bundlelist: iterable of (hashable) bundle names to look up.
    Returns a plain dict mapping game title -> list of bundle names.
    """
    # Hashed once so the per-link membership test below is a set lookup.
    wanted = frozenset(bundlelist)
    games = collections.defaultdict(list)
    urls = [
        'http://www.epicbundle.com/bundle-by/humble-bundle/',
        'http://www.epicbundle.com/bundle-by/humble-bundle/?page=1',
        ]

    tree = get_etree(urls[-1])
    pager = tree.xpath('//div[@class="pagerBox"]/div/span[@class="page"]')
    if pager:
        rawpages = get_string(pager[0])
        try:
            pages = int(rawpages.replace('Page 1 of ', '').strip())
        except ValueError:
            logger.warning(
                'Unexpected pager text %r, only the first page is scanned',
                rawpages
                )
        else:
            urls.extend(
                'http://www.epicbundle.com/bundle-by/humble-bundle/?page=%d' % i
                for i in xrange(2, pages+1)
                )

    # The first listing page is requested through two URLs, so the same
    # bundle page may be linked more than once; count each one only once.
    visited = set()
    for url in urls:
        tree = get_etree(url)
        for link in tree.xpath('//li[@class="bundleItem bundleItemHilightBG"]/a'):
            for span in link.findall('./span'):
                link.remove(span)
            bundle = get_string(link)
            if bundle not in wanted:
                continue
            href = link.attrib['href']
            if href in visited:
                continue
            visited.add(href)
            detail = get_etree(href)
            for title in detail.xpath('//div[@class="bundleHeadline"]/h2'):
                games[get_string(title)].append(bundle)
    return dict(games)


def iter_duplicated(bundlelist=()):
    """Yield ``(game, bundles)`` for games found in more than one bundle.

    This is a generator: nothing is fetched until iteration starts.

    The games are looked up with ``get_games_epicbundle``. Every name in
    *bundlelist* that does not show up in any game's bundle list is
    reported with a warning on the module logger.

    bundlelist: iterable of (hashable) bundle names.

    Yields, for each game whose bundle list does not have exactly one entry:
      * if *bundlelist* is non-empty: ``(game, frozenset)`` with the
        requested bundles containing the game, only when there are at
        least two of them;
      * if *bundlelist* is empty: ``(game, list)`` with the game's bundles.
        NOTE(review): get_games_epicbundle only records bundles that are
        in *bundlelist*, so this branch looks unreachable in practice --
        confirm.
    """
    bundleset = frozenset(bundlelist)
    # Pass the set on rather than the original argument, so a one-shot
    # iterable that was just consumed above still works.
    games = get_games_epicbundle(bundleset)

    # Verify bundleset: collect every bundle that was actually seen.
    known = set()
    for bundles in itervalues(games):
        known.update(bundles)
    for bundle in bundleset:
        if bundle not in known:
            logger.warning('Bundle %r is unknown', bundle)

    # Yield games
    for game, bundles in iteritems(games):
        if len(bundles) == 1:
            continue
        if bundleset:
            intersection = bundleset.intersection(bundles)
            if len(intersection) > 1:
                yield game, intersection
        else:
            yield game, bundles

if __name__ == '__main__':
    # Install the default logging configuration before any lookup runs.
    logging.basicConfig()
    # Materialize the generator into a dict and pretty-print it.
    duplicates = dict(iter_duplicated(owned))
    pprint.pprint(duplicates)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import re
	import lxml, lxml.etree as etree
	import collections
	import urllib2
	import pprint
	import logging
	import cPickle

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Names of the bundles to check against each other. The literal holds one
	# name per line; each line is stripped of its surrounding whitespace.
	owned = map(str.strip, '''
	Humble Bundle for Android 2
	Humble Bundle for Android 4
	Humble Bundle with Android 5
	Humble Bundle: PC and Android 7
	Humble Bundle: PC and Android 8
	Humble Bundle: PC and Android 9
	Humble Cartoon Network Mobile Bundle
	Humble Crescent Moon Mobile Bundle
	Humble Double Fine Bundle
	Humble Indie Bundle 13
	Humble Indie Bundle 14
	Humble Indie Bundle 6
	Humble Indie Bundle 7
	Humble Indie Bundle 9
	Humble Indie Bundle X
	Humble Mo-boo!-ile Bundle
	Humble Mobile Bundle
	Humble Mobile Bundle 10
	Humble Mobile Bundle 2
	Humble Mobile Bundle 4
	Humble Mobile Bundle 5
	Humble Mobile Bundle 7
	Humble Mobile Bundle 9
	Humble PC & Android Bundle 12
	Humble SEGA Mobile Bundle
	Humble Weekly Sale: Egosoft
	Metro 2033
	Tropico 3
	'''.strip().splitlines())

	# Unbound dict iteration methods (Python 2 dict API), so code can write
	# iteritems(d) / itervalues(d).
	iteritems = dict.iteritems
	itervalues = dict.itervalues
	# Single lxml HTML parser instance, passed to etree.XML() by get_etree().
	parser = etree.HTMLParser()
	# Raw response bodies keyed by URL; filled in by get_etree().
	urlcache = {}

	def get_string(elm):
		"""Return *elm* serialized with ``method='text'`` as stripped unicode."""
		raw = etree.tostring(elm, method='text', encoding='UTF-8')
		# Strip the UTF-8 byte string first, then decode it.
		return unicode(raw.strip(), 'utf-8')

	def get_xml(elm):
		"""Return *elm* serialized with ``method='xml'`` as stripped unicode."""
		raw = etree.tostring(elm, method='xml', encoding='UTF-8')
		# Strip the UTF-8 byte string first, then decode it.
		return unicode(raw.strip(), 'utf-8')

	def get_etree(url):
		"""Fetch *url* (at most once) and parse it with the shared HTML parser.

		The raw response body is stored in the module-level ``urlcache`` keyed
		by URL, so a second call for the same URL does not open it again. The
		cached bytes are parsed anew on every call, so each call returns a
		separate tree.

		Returns whatever ``etree.XML(body, parser)`` produces for the body.
		Errors raised while opening or reading the URL propagate to the caller
		and leave ``urlcache`` without an entry for *url*.
		"""
		if url not in urlcache:
			response = urllib2.urlopen(url)
			try:
				urlcache[url] = response.read()
			finally:
				# Release the connection even when reading the body fails.
				response.close()
		return etree.XML(urlcache[url], parser)

	def get_games_wikipedia(bundlelist):
		"""Map game titles to bundle names using Wikipedia's list of bundles.

		Scans every row of every ``table.wikitable`` on the page:

		* a row with 8 ``<td>`` cells sets the current bundle (text of cell 1)
		  and takes its game title from the first ``<i>`` in cell 3;
		* a row with 2 or 3 cells takes its title from the first ``<i>`` in
		  cell 1 and is filed under the most recently seen bundle
		  (presumably continuation rows of a row-spanning bundle cell --
		  verify against the live page);
		* any other row is ignored.

		Rows without an ``<i>`` title, and short rows that appear before any
		bundle row, are skipped.

		bundlelist: accepted but not used by this function.
		Returns a plain dict mapping game title -> list of bundle names.
		"""
		games = collections.defaultdict(list)
		url = 'http://en.wikipedia.org/wiki/List_of_Humble_Bundles'
		tree = get_etree(url)
		# No bundle is known until the first 8-cell row has been seen.
		bundle = None
		for tr in tree.xpath('//table[@class="wikitable"]/tr'):
			tds = tr.xpath('./td')
			if len(tds) == 8:
				# Remember the bundle even if this row turns out to have no
				# title, so that following short rows are still attributed.
				bundle = get_string(tds[0])
				titles = tds[2].xpath('./i')
			elif 1 < len(tds) < 4:
				titles = tds[0].xpath('./i')
			else:
				continue
			if bundle is None or not titles:
				continue
			games[get_string(titles[0])].append(bundle)
		return dict(games)

	def get_games_epicbundle(bundlelist):
		"""Collect the games of the requested bundles from epicbundle.com.

		Walks the Humble Bundle listing pages. For every listed bundle whose
		link text (after removing the link's ``<span>`` children) is in
		*bundlelist*, the linked bundle page is fetched and the text of each
		``<h2>`` directly under a ``div.bundleHeadline`` is recorded as a game
		of that bundle.

		The number of listing pages is read from the pager text, expected to
		look like ``Page 1 of N``. If there is no pager, only the two initial
		URLs are scanned; if the pager text cannot be parsed a warning is
		logged and the scan is likewise limited to the initial URLs.

		bundlelist: iterable of (hashable) bundle names to look up.
		Returns a plain dict mapping game title -> list of bundle names.
		"""
		# Hashed once so the per-link membership test below is a set lookup.
		wanted = frozenset(bundlelist)
		games = collections.defaultdict(list)
		urls = [
			'http://www.epicbundle.com/bundle-by/humble-bundle/',
			'http://www.epicbundle.com/bundle-by/humble-bundle/?page=1',
			]

		tree = get_etree(urls[-1])
		pager = tree.xpath('//div[@class="pagerBox"]/div/span[@class="page"]')
		if pager:
			rawpages = get_string(pager[0])
			try:
				pages = int(rawpages.replace('Page 1 of ', '').strip())
			except ValueError:
				logger.warning(
					'Unexpected pager text %r, only the first page is scanned',
					rawpages
					)
			else:
				urls.extend(
					'http://www.epicbundle.com/bundle-by/humble-bundle/?page=%d' % i
					for i in xrange(2, pages+1)
					)

		# The first listing page is requested through two URLs, so the same
		# bundle page may be linked more than once; count each one only once.
		visited = set()
		for url in urls:
			tree = get_etree(url)
			for link in tree.xpath('//li[@class="bundleItem bundleItemHilightBG"]/a'):
				for span in link.findall('./span'):
					link.remove(span)
				bundle = get_string(link)
				if bundle not in wanted:
					continue
				href = link.attrib['href']
				if href in visited:
					continue
				visited.add(href)
				detail = get_etree(href)
				for title in detail.xpath('//div[@class="bundleHeadline"]/h2'):
					games[get_string(title)].append(bundle)
		return dict(games)


	def iter_duplicated(bundlelist=()):
		"""Yield ``(game, bundles)`` for games found in more than one bundle.

		This is a generator: nothing is fetched until iteration starts.

		The games are looked up with ``get_games_epicbundle``. Every name in
		*bundlelist* that does not show up in any game's bundle list is
		reported with a warning on the module logger.

		bundlelist: iterable of (hashable) bundle names.

		Yields, for each game whose bundle list does not have exactly one entry:
		  * if *bundlelist* is non-empty: ``(game, frozenset)`` with the
		    requested bundles containing the game, only when there are at
		    least two of them;
		  * if *bundlelist* is empty: ``(game, list)`` with the game's bundles.
		    NOTE(review): get_games_epicbundle only records bundles that are
		    in *bundlelist*, so this branch looks unreachable in practice --
		    confirm.
		"""
		bundleset = frozenset(bundlelist)
		# Pass the set on rather than the original argument, so a one-shot
		# iterable that was just consumed above still works.
		games = get_games_epicbundle(bundleset)

		# Verify bundleset: collect every bundle that was actually seen.
		known = set()
		for bundles in itervalues(games):
			known.update(bundles)
		for bundle in bundleset:
			if bundle not in known:
				logger.warning('Bundle %r is unknown', bundle)

		# Yield games
		for game, bundles in iteritems(games):
			if len(bundles) == 1:
				continue
			if bundleset:
				intersection = bundleset.intersection(bundles)
				if len(intersection) > 1:
					yield game, intersection
			else:
				yield game, bundles

	if __name__ == '__main__':
		# Install the default logging configuration before any lookup runs.
		logging.basicConfig()

		# Materialize the generator into a dict and pretty-print it.
		pprint.pprint(dict(iter_duplicated(owned)))