Skip to content

Instantly share code, notes, and snippets.

@ergoithz
Last active August 29, 2015 14:00
Show Gist options
  • Save ergoithz/fc7eb67e4e4893d2a7a2 to your computer and use it in GitHub Desktop.
Save ergoithz/fc7eb67e4e4893d2a7a2 to your computer and use it in GitHub Desktop.
Humble Bundle duplicate finder
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import lxml, lxml.etree as etree
import collections
import urllib2
import pprint
import logging
import cPickle
# Module-level logger; iter_duplicated uses it to report unknown bundles.
logger = logging.getLogger(__name__)
# Bundle names to check for duplicated games, one name per line.
# This is the list passed to iter_duplicated when the file runs as a script.
owned = '''
Humble Bundle for Android 2
Humble Bundle for Android 4
Humble Bundle with Android 5
Humble Bundle: PC and Android 7
Humble Bundle: PC and Android 8
Humble Bundle: PC and Android 9
Humble Cartoon Network Mobile Bundle
Humble Crescent Moon Mobile Bundle
Humble Double Fine Bundle
Humble Indie Bundle 13
Humble Indie Bundle 14
Humble Indie Bundle 6
Humble Indie Bundle 7
Humble Indie Bundle 9
Humble Indie Bundle X
Humble Mo-boo!-ile Bundle
Humble Mobile Bundle
Humble Mobile Bundle 10
Humble Mobile Bundle 2
Humble Mobile Bundle 4
Humble Mobile Bundle 5
Humble Mobile Bundle 7
Humble Mobile Bundle 9
Humble PC & Android Bundle 12
Humble SEGA Mobile Bundle
Humble Weekly Sale: Egosoft
Metro 2033
Tropico 3
'''
# Turn the text block above into a sequence of stripped names, one per line
# (under Python 2, which this file targets, ``map`` returns a list).
owned = map(str.strip, owned.strip().splitlines())
# Unbound dict iteration methods, called as iteritems(d) / itervalues(d).
iteritems = dict.iteritems
itervalues = dict.itervalues
# Single lxml HTML parser instance handed to etree.XML by get_etree.
parser = etree.HTMLParser()
# Raw response bodies keyed by URL; filled and read by get_etree.
urlcache = {}
def get_string(elm):
    """Return the text content of ``elm`` as a stripped unicode string.

    The element is serialised with lxml's ``text`` method as UTF-8 bytes,
    surrounding whitespace is stripped, and the result is decoded to
    ``unicode``.
    """
    raw_text = etree.tostring(elm, method='text', encoding='UTF-8')
    trimmed = raw_text.strip()
    return unicode(trimmed, 'utf-8')
def get_xml(elm):
    """Return the XML markup of ``elm`` as a stripped unicode string.

    Same pipeline as the text variant, but the element is serialised with
    lxml's ``xml`` method, so tags are kept in the output.
    """
    serialized = etree.tostring(elm, method='xml', encoding='UTF-8')
    serialized = serialized.strip()
    return unicode(serialized, 'utf-8')
def get_etree(url):
    """Download ``url`` (at most once) and parse it into an lxml tree.

    The raw response body is stored in the module-level ``urlcache`` dict,
    so repeated calls with the same URL do not hit the network again. The
    cached body is re-parsed on every call with the module-level ``parser``,
    which means each caller gets its own tree and may mutate it freely.

    :param url: address of the page to fetch.
    :returns: root element produced by ``etree.XML`` for the page body.
    :raises: whatever ``urllib2.urlopen`` / ``read`` raise on network
        failure; nothing is cached in that case.
    """
    if url not in urlcache:
        response = urllib2.urlopen(url)
        try:
            urlcache[url] = response.read()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
    return etree.XML(urlcache[url], parser)
def get_games_wikipedia(bundlelist):
    """Map game titles to the bundles that list them on Wikipedia.

    Walks the rows of every ``wikitable`` on the "List of Humble Bundles"
    page. A row with eight cells starts a bundle: the bundle name is read
    from the first cell and the game title from the ``<i>`` element in the
    third cell. A row with two or three cells is treated as a continuation
    of the most recent bundle, with the game title in the ``<i>`` element
    of its first cell. All other rows are ignored.

    :param bundlelist: not used by this function; kept so the signature
        matches ``get_games_epicbundle``.
    :returns: plain dict of ``game title -> list of bundle names``.
    """
    games = collections.defaultdict(list)
    url = 'http://en.wikipedia.org/wiki/List_of_Humble_Bundles'
    tree = get_etree(url)
    # No bundle is known until the first eight-cell row has been seen.
    bundle = None
    for tr in tree.xpath('//table[@class="wikitable"]/tr'):
        tds = tr.xpath('./td')
        if len(tds) == 8:
            # Record the bundle first so continuation rows attach to it even
            # if this row turns out to have no italic game title.
            bundle = get_string(tds[0])
            title_cell = tds[2]
        elif 1 < len(tds) < 4:
            if bundle is None:
                # Continuation row before any bundle header: nothing to
                # attach the game to (previously an unbound-name error).
                continue
            title_cell = tds[0]
        else:
            continue
        titles = title_cell.xpath('./i')
        if not titles:
            # Rows without an italic title carry no game name.
            continue
        games[get_string(titles[0])].append(bundle)
    return dict(games)
def get_games_epicbundle(bundlelist):
    """Map game titles to the bundles from ``bundlelist`` that contain them.

    Listing pages of epicbundle.com's Humble Bundle section are fetched
    through ``get_etree``. The number of pages is read from the pager text
    (``Page 1 of N``) on the ``?page=1`` listing; if no pager element is
    found, only the two initial URLs are visited. For every bundle link on
    a listing page, its ``<span>`` children are removed from the parsed tree
    and the remaining text is taken as the bundle name. When that name is in
    ``bundlelist``, the linked bundle page is fetched and each
    ``bundleHeadline`` ``<h2>`` on it is recorded as a game of that bundle.

    :param bundlelist: container of bundle names to look up.
    :returns: plain dict of ``game title -> list of bundle names``.
    """
    listing_urls = [
        'http://www.epicbundle.com/bundle-by/humble-bundle/',
        'http://www.epicbundle.com/bundle-by/humble-bundle/?page=1',
    ]
    try:
        first_page = get_etree(listing_urls[-1])
        pager = first_page.xpath(
            '//div[@class="pagerBox"]/div/span[@class="page"]'
        )[0]
        pager_text = get_string(pager)
        page_count = int(pager_text.replace('Page 1 of ', '').strip())
        for number in xrange(2, page_count + 1):
            listing_urls.append(
                'http://www.epicbundle.com/bundle-by/humble-bundle/?page=%d'
                % number
            )
    except IndexError:
        # No pager on the page: stick with the initial URLs.
        pass

    found = collections.defaultdict(list)
    for listing_url in listing_urls:
        listing = get_etree(listing_url)
        anchors = listing.xpath(
            '//li[@class="bundleItem bundleItemHilightBG"]/a'
        )
        for anchor in anchors:
            # Drop the <span> children so only the link's own text remains.
            for span in anchor.findall('./span'):
                anchor.remove(span)
            bundle_name = get_string(anchor)
            if bundle_name not in bundlelist:
                continue
            bundle_page = get_etree(anchor.attrib['href'])
            headings = bundle_page.xpath('//div[@class="bundleHeadline"]/h2')
            for heading in headings:
                found[get_string(heading)].append(bundle_name)
    return dict(found)
def iter_duplicated(bundlelist=()):
    """Yield ``(game, bundles)`` for games that appear in several bundles.

    The game/bundle mapping comes from ``get_games_epicbundle``. Every
    requested bundle that does not show up in that mapping is reported with
    a warning on the module logger. Games listed under a single bundle entry
    are skipped.

    :param bundlelist: iterable of bundle names to restrict the search to.
    :returns: generator of ``(game, bundles)`` pairs. When bundle names were
        given, ``bundles`` is the frozenset of requested bundles containing
        the game and only games with more than one such bundle are yielded;
        otherwise it is the list stored in the mapping.

    NOTE(review): ``get_games_epicbundle`` only records bundles that are in
    the container it is given, so with an empty ``bundlelist`` the mapping
    appears to be always empty and nothing is yielded -- confirm whether
    "no filter" was meant to return every duplicate.
    """
    # Materialise once: this also keeps a one-shot iterable from being
    # exhausted before the lookup below, and gives O(1) membership tests.
    bundleset = frozenset(bundlelist)
    games = get_games_epicbundle(bundleset)

    # Verify bundleset: collect every bundle name seen in the results, then
    # warn about requested names that never appeared.
    known = set()
    for bundles in itervalues(games):
        known.update(bundles)
    for bundle in bundleset:
        if bundle not in known:
            logger.warning('Bundle %r is unknown', bundle)

    # Yield games
    for game, bundles in iteritems(games):
        if len(bundles) == 1:
            continue
        if bundleset:
            intersection = bundleset.intersection(bundles)
            if len(intersection) > 1:
                yield game, intersection
        else:
            yield game, bundles
if __name__ == '__main__':
    # Script mode: set up default logging so warnings are visible, then
    # pretty-print every duplicated game found among the `owned` bundles.
    logging.basicConfig()
    duplicates = dict(iter_duplicated(owned))
    pprint.pprint(duplicates)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment