Last active
August 29, 2015 14:00
-
-
Save ergoithz/fc7eb67e4e4893d2a7a2 to your computer and use it in GitHub Desktop.
Humble Bundle duplicate finder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import lxml, lxml.etree as etree | |
import collections | |
import urllib2 | |
import pprint | |
import logging | |
import cPickle | |
logger = logging.getLogger(__name__)

# Bundles owned by the user, one name per line. These names are compared
# verbatim against the bundle names scraped from the listing pages.
owned = '''
Humble Bundle for Android 2
Humble Bundle for Android 4
Humble Bundle with Android 5
Humble Bundle: PC and Android 7
Humble Bundle: PC and Android 8
Humble Bundle: PC and Android 9
Humble Cartoon Network Mobile Bundle
Humble Crescent Moon Mobile Bundle
Humble Double Fine Bundle
Humble Indie Bundle 13
Humble Indie Bundle 14
Humble Indie Bundle 6
Humble Indie Bundle 7
Humble Indie Bundle 9
Humble Indie Bundle X
Humble Mo-boo!-ile Bundle
Humble Mobile Bundle
Humble Mobile Bundle 10
Humble Mobile Bundle 2
Humble Mobile Bundle 4
Humble Mobile Bundle 5
Humble Mobile Bundle 7
Humble Mobile Bundle 9
Humble PC & Android Bundle 12
Humble SEGA Mobile Bundle
Humble Weekly Sale: Egosoft
Metro 2033
Tropico 3
'''
# Split the text block into a list of names with surrounding whitespace removed.
owned = [line.strip() for line in owned.strip().splitlines()]
# Raw response bodies keyed by URL, filled in by get_etree().
urlcache = {}
# One lxml HTML parser instance reused for every page that gets parsed.
parser = etree.HTMLParser()

# Unbound dict iterator methods (Python 2), called as iteritems(d) / itervalues(d).
itervalues = dict.itervalues
iteritems = dict.iteritems
def get_string(elm):
    """Return *elm* serialised with lxml's ``text`` method as a unicode string.

    The UTF-8 bytes produced by ``etree.tostring`` are stripped of
    surrounding whitespace before being decoded.
    """
    raw = etree.tostring(elm, method='text', encoding='UTF-8')
    return raw.strip().decode('utf-8')
def get_xml(elm):
    """Return *elm* serialised with lxml's ``xml`` method as a unicode string.

    The UTF-8 bytes produced by ``etree.tostring`` are stripped of
    surrounding whitespace before being decoded.
    """
    raw = etree.tostring(elm, method='xml', encoding='UTF-8')
    return raw.strip().decode('utf-8')
def get_etree(url):
    """Fetch *url* and return its body parsed with the shared HTML parser.

    The raw response body is stored in ``urlcache`` keyed by URL, so a URL is
    only downloaded the first time it is requested. The cached body is
    re-parsed on every call, so each caller receives its own tree.

    :param url: address of the page to load.
    :returns: root element of the parsed document.
    """
    if url not in urlcache:
        response = urllib2.urlopen(url)
        try:
            urlcache[url] = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
    return etree.XML(urlcache[url], parser)
def get_games_wikipedia(bundlelist):
    """Map game names to the bundles that list them on Wikipedia.

    Reads the rows of every ``wikitable`` on the "List of Humble Bundles"
    article. A row with eight cells starts a bundle: the bundle name is taken
    from cell 1 and a game name from the ``<i>`` element in cell 3. A row
    with two or three cells adds another game (``<i>`` element in cell 1) to
    the most recently seen bundle. Rows of any other width, rows without an
    ``<i>`` element, and game rows appearing before any bundle row are
    skipped.

    :param bundlelist: not used by this function.
    :returns: dict mapping game name to a list of bundle names.
    """
    games = collections.defaultdict(list)
    url = 'http://en.wikipedia.org/wiki/List_of_Humble_Bundles'
    tree = get_etree(url)
    bundle = None  # no bundle row seen yet
    for tr in tree.xpath('//table[@class="wikitable"]/tr'):
        tds = tr.xpath('./td')
        if len(tds) == 8:
            bundle = get_string(tds[0])
            italics = tds[2].xpath('./i')
        elif 1 < len(tds) < 4:
            italics = tds[0].xpath('./i')
        else:
            continue
        # Without a game title, or without a bundle to attach it to, the row
        # carries nothing usable.
        if not italics or bundle is None:
            continue
        games[get_string(italics[0])].append(bundle)
    return dict(games)
def get_games_epicbundle(bundlelist):
    """Map game names to the bundles from *bundlelist* that contain them.

    Walks the epicbundle.com Humble Bundle listing pages. For every listed
    bundle whose name appears in *bundlelist*, the bundle's own page is
    loaded and each ``bundleHeadline`` title on it is recorded as a game of
    that bundle.

    :param bundlelist: iterable of bundle names to look for; names are
        matched by equality against the link text of the listing.
    :returns: dict mapping game name to a list of bundle names.
    """
    games = collections.defaultdict(list)
    # Build the lookup set once instead of scanning the list for every link.
    wanted = frozenset(bundlelist)
    # NOTE(review): both the bare URL and ?page=1 are scanned; if they serve
    # the same listing, each bundle is recorded twice per game -- confirm.
    urls = [
        'http://www.epicbundle.com/bundle-by/humble-bundle/',
        'http://www.epicbundle.com/bundle-by/humble-bundle/?page=1',
    ]

    # Discover additional listing pages from the pager, when there is one.
    pager = get_etree(urls[-1]).xpath(
        '//div[@class="pagerBox"]/div/span[@class="page"]'
    )
    if pager:
        rawpages = get_string(pager[0])
        try:
            pages = int(rawpages.replace('Page 1 of ', '').strip())
        except ValueError:
            # Same fallback as a missing pager: scan only the known pages.
            logger.warning(
                'Unexpected pager text %r, scanning first page only',
                rawpages
            )
        else:
            urls.extend(
                'http://www.epicbundle.com/bundle-by/humble-bundle/?page=%d' % i
                for i in xrange(2, pages + 1)
            )

    for url in urls:
        tree = get_etree(url)
        for link in tree.xpath('//li[@class="bundleItem bundleItemHilightBG"]/a'):
            # Drop nested <span> children so only the remaining link text is
            # read as the bundle name.
            # NOTE(review): lxml's remove() also discards the removed
            # element's tail text -- confirm the name never follows a span.
            for span in link.findall('./span'):
                link.remove(span)
            bundle = get_string(link)
            if bundle not in wanted:
                continue
            detail = get_etree(link.attrib['href'])
            for title in detail.xpath('//div[@class="bundleHeadline"]/h2'):
                games[get_string(title)].append(bundle)
    return dict(games)
def iter_duplicated(bundlelist=()):
    """Yield ``(game, bundles)`` for games found in more than one bundle.

    Game data comes from ``get_games_epicbundle(bundlelist)``. A warning is
    logged for every requested bundle that does not appear in any scraped
    game entry.

    :param bundlelist: iterable of bundle names to consider.
    :returns: generator of ``(game, bundles)`` pairs. With a non-empty
        *bundlelist*, ``bundles`` is a frozenset of the requested bundles
        containing the game, and only games in at least two of them are
        yielded. Otherwise ``bundles`` is the scraped list itself.
    """
    bundleset = frozenset(bundlelist)
    games = get_games_epicbundle(bundlelist)

    # Verify bundleset: collect every bundle that contributed a game once,
    # then report the requested ones that are missing.
    known = set()
    for found in games.values():
        known.update(found)
    for bundle in bundleset:
        if bundle not in known:
            logger.warning('Bundle %r is unknown', bundle)

    # Yield games
    for game, bundles in games.items():
        if len(bundles) == 1:
            continue
        if not bundleset:
            # NOTE(review): get_games_epicbundle only records bundles named
            # in bundlelist, so with an empty list this branch looks
            # unreachable -- confirm the intended "all bundles" behaviour.
            yield game, bundles
            continue
        intersection = bundleset.intersection(bundles)
        if len(intersection) > 1:
            yield game, intersection
if __name__ == '__main__':
    # Route log records (e.g. unknown-bundle warnings) to stderr.
    logging.basicConfig()
    # Collect every duplicated game for the owned bundles, then pretty-print.
    duplicates = dict(iter_duplicated(owned))
    pprint.pprint(duplicates)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment