Skip to content

Instantly share code, notes, and snippets.

@hufman
Last active January 2, 2016 22:37
Show Gist options
  • Save hufman/17b96b58cb78d416ee81 to your computer and use it in GitHub Desktop.
Save hufman/17b96b58cb78d416ee81 to your computer and use it in GitHub Desktop.
Animal Crossing Wikia Parser
You'll need to `pip install mwparserfromhell` to run this code. Tested in Python 2.7 and 3.2.
#!/usr/bin/env python
import codecs
import contextlib
import json
import logging
import re
import sys

import mwparserfromhell
# Python 2/3 compatibility: expose `urlquote`, `urlopen` and `basestring`
# under the same names regardless of the interpreter's major version.
if sys.version_info[0] != 3:
    from urllib import quote as urlquote
    from urllib import urlopen
    basestring = basestring  # re-export the builtin as a module-level name
else:
    from urllib.parse import quote as urlquote
    from urllib.request import urlopen
    basestring = str
def urlquote_titles(titles):
    """Percent-encode page titles for use in a ``titles=`` query value.

    A single string is quoted as-is; any other iterable has each element
    quoted and the results joined with '|'.
    """
    if not isinstance(titles, basestring):
        # several titles: quote each one, keep the '|' separators literal
        quoted = [urlquote(title) for title in titles]
        return '|'.join(quoted)
    # one lone title
    return urlquote(titles)
# Endpoint that every query URL below is built on.
API_BASE = "http://animalcrossing.wikia.com/api.php"


def CATEGORY_MEMBERS_URL(category):
    """Build the query URL listing the pages in ``Category:<category>``."""
    template = "%s?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=max&cmtitle=Category:%s"
    return template % (API_BASE, urlquote(category))


def INFOBOX_URL(titles):
    """Build the query URL fetching section 0 revision content of ``titles``."""
    template = "%s?action=query&prop=revisions&rvprop=content&rvsection=0&format=json&titles=%s"
    return template % (API_BASE, urlquote_titles(titles),)


def ARTICLE_CATEGORIES_URL(titles):
    """Build the query URL listing the categories of ``titles``."""
    template = "%s?action=query&prop=categories&format=json&titles=%s"
    return template % (API_BASE, urlquote_titles(titles),)
def category_members(category):
    """Return all page entries that belong to the wiki category ``category``.

    The API is queried repeatedly, following the ``query-continue`` cookie
    in each response, until a response carries no continuation.

    :param category: category name without the ``Category:`` prefix
    :returns: list of the ``categorymembers`` entries from every response
    """
    members = []
    base_url = CATEGORY_MEMBERS_URL(category)
    url = base_url
    while url:
        # closing() gives a context manager on Python 2 as well, where the
        # object returned by urlopen does not support `with` on its own
        with contextlib.closing(urlopen(url)) as reader:
            utf8reader = codecs.getreader('utf-8')
            data = json.load(utf8reader(reader))
        members.extend(data['query']['categorymembers'])
        if 'query-continue' in data:
            cookie = data['query-continue']['categorymembers']['cmcontinue']
            # re-issue the same category query with the continuation cookie
            url = base_url + "&cmcontinue=%s" % (urlquote(cookie),)
        else:
            url = None
    return members
def ripout_infobox(firstsection):
    """Extract the wikitext of the first top-level Infobox template.

    ``firstsection`` is parsed with mwparserfromhell; the first non-nested
    template whose name contains 'Infobox' is returned as a string, or ''
    when there is none.
    """
    parsed = mwparserfromhell.parse(firstsection)
    top_level = parsed.ifilter_templates(recursive=False)
    matches = (str(tpl) for tpl in top_level if 'Infobox' in tpl.name)
    return next(matches, '')
def fetch_infobox(titles):
    """Return the raw infobox data for the given titles.

    :param titles: a single title string or an iterable of title strings
    :returns: dict keyed by the page id as it appears in the response; each
        value is a dict with 'pageid', 'ns', 'title' and 'data' (the raw
        infobox wikitext, '' if the page has no top-level infobox)

    Pages that come back without any revision (e.g. a title that does not
    exist) are logged and left out of the result.
    """
    url = INFOBOX_URL(titles)
    # closing() gives a context manager on Python 2 as well, where the
    # object returned by urlopen does not support `with` on its own
    with contextlib.closing(urlopen(url)) as reader:
        utf8reader = codecs.getreader('utf-8')
        response = json.load(utf8reader(reader))
    fetched = {}
    for pageid, page in response['query']['pages'].items():
        revisions = page.get('revisions')
        if not revisions:
            logging.warning("No revision content for page %s",
                            page.get('title', pageid))
            continue
        revision = revisions[0]
        if '*' in revision:
            # the legacy JSON format stores the wikitext under the '*' key
            revision_data = revision['*']
        else:
            # fall back to the previous behaviour: first value of the dict
            revision_data = list(revision.values())[0]
        fetched[pageid] = {
            'pageid': page['pageid'],
            'ns': page['ns'],
            'title': page['title'],
            'data': ripout_infobox(revision_data)
        }
    return fetched
def parse_infobox(infobox_data):
    """Build an Infobox object from one entry returned by fetch_infobox.

    The template name with its spaces removed (e.g. 'Infobox Fish' ->
    'InfoboxFish') is looked up among the module globals; when no global of
    that name exists, the generic Infobox class is used.

    :param infobox_data: dict with 'pageid', 'title' and 'data' keys
    :returns: the object produced by the selected infobox class
    :raises ValueError: if the first template in 'data' is not an Infobox
    """
    code = mwparserfromhell.parse(infobox_data['data'])
    templates = code.filter_templates()
    if not templates or 'Infobox' not in templates[0].name:
        raise ValueError("Could not find Infobox")
    # Convert to a plain string first: Wikicode.replace() is the
    # node-replacement method (edits in place, returns None), not
    # str.replace(). strip() drops the whitespace/newline that usually
    # follows a template name.
    infobox_name = str(templates[0].name).strip()
    infobox_classname = infobox_name.replace(' ', '')
    infobox_class = globals().get(infobox_classname, Infobox)
    return infobox_class(infobox_data['pageid'], infobox_data['title'], infobox_data['data'])
class GameSpecificValue(object):
    """A value that may differ between games.

    ``values`` maps a game code to the value for that game; the '' key
    holds the value used for any game without an entry of its own.
    """

    # Game code copied into `display_game` when an instance is created.
    DEFAULT_GAME = 'NL'

    def __init__(self, values):
        self.values = values
        self.display_game = GameSpecificValue.DEFAULT_GAME

    def games(self):
        """Return the list of keys (game codes) present in ``values``."""
        return list(self.values.keys())

    def as_game(self, game=None):
        """Return the value for ``game`` (default: ``self.display_game``).

        Falls back to the '' entry, and to None when that is missing too.
        """
        if game is None:
            game = self.display_game
        return self.values.get(game, self.values.get(''))

    def __str__(self):
        value = self.as_game()
        # __str__ must return a string; as_game() yields None when neither
        # the display game nor the '' fallback has an entry
        if value is None:
            return ''
        return str(value)

    def __repr__(self):
        return "<GSV: %r>" % (self.values,)
class Infobox(object):
    """One infobox template and its parsed parameters.

    ``params_mw`` holds the raw parameter values keyed by stripped
    parameter name; ``params`` holds the parsed values. A parameter called
    ``foo`` is parsed by a method called ``_parse_foo`` when one exists and
    by ``_parse_default`` otherwise.
    """

    # Matches one "value" or "value (cat1, cat2)" segment of a
    # comma-separated list; a leading "1,234 " style number may contain
    # commas without ending the segment.
    _SEGMENT_FINDER = re.compile(
        r'\s*((?:[0-9,]+ [^(,]*)|[^(,]*)\s*(?:\(([^)]*)\))?(?:[\s,])*')

    def __init__(self, pageid, title, data):
        """Parse ``data`` (wikitext) and read the first template in it.

        Raises IndexError when ``data`` contains no template.
        """
        self.pageid = pageid
        self.title = title
        self.data = data
        code = mwparserfromhell.parse(data)
        self.code = code.filter_templates()[0]
        self.params_mw = {}
        self.params = {}
        self._parse()

    def to_dict(self):
        """Return a shallow copy of the parsed parameters."""
        return dict(self.params)

    def _parse(self):
        """Collect the key=value parameters, then parse each of them."""
        for param in self.code.params:
            if param.showkey:  # is a key=value param
                name = str(param.name).strip()
                self.params_mw[name] = param.value
        # NOTE(review): no `_parse_appearance` method exists (only
        # `_parse_appearances`), so an 'appearance' parameter is handled by
        # _parse_default here -- confirm which parameter name the wiki uses.
        self._parse_param('appearance')
        self._parse_params()

    def _parse_params(self):
        """Parse every collected parameter."""
        for name in self.params_mw:
            self._parse_param(name)

    def _parse_param(self, name):
        """Parse parameter ``name`` into ``self.params``.

        Does nothing when the parameter is absent or its value is empty.
        """
        value = self.params_mw.get(name)
        if not value:
            # somehow could not load up this name
            return
        parser = getattr(self, '_parse_%s' % (name,), None)
        if parser is not None:
            value = parser(value)
        else:
            value = self._parse_default(value)
        self.params[name] = value

    def _parse_default(self, value):
        """Return the value with wiki markup removed and whitespace trimmed."""
        return value.strip_code().strip()

    def _parse_appearances(self, value):
        """Turn an appearances value into a list of game codes.

        A lone ``since`` template yields the codes of ``order`` from its
        first argument onwards; a lone ``until`` template yields the codes
        up to and including its first argument. Otherwise each template
        contributes its lower-cased name; a template with unexpected
        parameters is logged and the whole raw value is appended instead.
        """
        order = ['af', 'af+', 'ac', 'afe+', 'ww', 'cf', 'nl']
        templates = value.filter_templates()
        if len(templates) == 1 and templates[0].name == 'since':
            since = templates[0].params[0].value
            index = order.index(since)
            return order[index:]
        elif len(templates) == 1 and templates[0].name == 'until':
            until = templates[0].params[0].value
            index = order.index(until)
            return order[:index + 1]
        else:
            appearances = []
            for template in templates:
                if len(template.params) == 0:
                    appearances.append(template.name.lower())
                elif len(template.params) == 1 and \
                        template.params[0].value == 'shortest':
                    appearances.append(template.name.lower())
                else:
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning("Unknown Appearance: %s" % (template,))
                    appearances.append(str(value))
            return appearances

    def _parse_nameother(self, value):
        """Map each template name in ``value`` to the text that follows it.

        The node directly after a template is taken as the name belonging
        to that template; the template's name is the dictionary key.
        """
        names = {}
        lang = None
        just_saw_lang = False
        for node in value.nodes:
            if isinstance(node, mwparserfromhell.nodes.template.Template):
                lang = node.name.strip_code()
                just_saw_lang = True
                continue
            if just_saw_lang:
                # str(node) equals node.value for Text nodes and also works
                # for node types that have no `.value` attribute
                names[lang] = str(node).strip()
                just_saw_lang = False
        return names

    def _parse_gamespecific_value(self, value):
        """Parse a categorized value and wrap it in a GameSpecificValue."""
        values = self._parse_categorized_value(value)
        return GameSpecificValue(values)

    def _parse_categorized_value(self, value):
        """Split "A (x, y), B" style text into ``{category: value}``.

        Markup characters (``<...>`` tags, braces, square brackets) are
        removed first. A value followed by a parenthesised list is stored
        under each listed category; a value without one is stored under ''.
        Later segments overwrite earlier ones with the same category.
        """
        categorized = {}
        value = re.sub(r'<[^>]*?>', '', str(value))
        value = re.sub(r'[{}]', '', value)
        value = re.sub(r'[\[\]]', '', value)
        for segment in self._SEGMENT_FINDER.finditer(value):
            parsed_groups = segment.groups()
            curvalue = parsed_groups[0]
            if curvalue is None or curvalue.strip() == '':
                continue
            curvalue = curvalue.strip()
            if len(parsed_groups) > 1 and parsed_groups[1] is not None:
                for category in parsed_groups[1].split(','):
                    category = category.strip()
                    categorized[category] = curvalue
            else:
                categorized[''] = curvalue
        return categorized

    # Parameters that share one of the parsers above.
    _parse_location = _parse_gamespecific_value
    _parse_price = _parse_gamespecific_value
    _parse_shadow = _parse_gamespecific_value
    _parse_size = _parse_gamespecific_value
    _parse_timeday = _parse_categorized_value
if __name__ == '__main__':
    # Script entry point: fetch infoboxes and print each one as a dict.
    allfish = category_members('Fish')
    allfishnames = (fish['title'] for fish in allfish)
    data = fetch_infobox(allfishnames)
    # NOTE(review): the result for all fish is replaced right away by the
    # single 'Char' page -- looks like a debugging leftover; confirm which
    # of the two fetches is wanted.
    data = fetch_infobox('Char')
    for pageid in data:
        pagedata = data[pageid]
        try:
            infobox = parse_infobox(pagedata)
            print("%s" % (infobox.to_dict(), ))
        except Exception as e:
            # report which page failed, then let the error propagate
            print("Error parsing %s: %s" % (pagedata['title'], e))
            raise
import unittest
import animalcrossing
class TestParsingParams(unittest.TestCase):
    """Checks for Infobox._parse_categorized_value on plain strings."""

    def setUp(self):
        self.instance = animalcrossing.Infobox('0', 'Test', '{{Test}}')

    def _categorize(self, text):
        """Run the parser under test on ``text``."""
        return self.instance._parse_categorized_value(text)

    def testSingleValue(self):
        expected = {'': 'Large'}
        self.assertEqual(self._categorize('Large'), expected)

    def testDoubleValue(self):
        # two uncategorized values: the later one wins
        expected = {'': 'Small'}
        self.assertEqual(self._categorize('Large, Small'), expected)

    def testSingleParensValue(self):
        expected = {'One': 'Large'}
        self.assertEqual(self._categorize('Large (One)'), expected)

    def testDoubleParensValue(self):
        expected = {'One': 'Large', 'Two': 'Large'}
        self.assertEqual(self._categorize('Large (One, Two)'), expected)

    def testTwoSingleParensValue(self):
        expected = {'One': 'Small', 'Two': 'Large'}
        self.assertEqual(self._categorize('Small (One), Large (Two)'), expected)

    def testTwoSingleExtraParensValue(self):
        expected = {'One': 'Small', 'Two': 'Large', 'Three': 'Large'}
        self.assertEqual(self._categorize('Small (One), Large (Two, Three)'), expected)

    def testSingleTwoSingleExtraParensValue(self):
        expected = {'': 'Bonus', 'One': 'Small', 'Two': 'Large', 'Three': 'Large'}
        self.assertEqual(self._categorize('Bonus, Small (One), Large (Two, Three)'), expected)

    def testBells(self):
        # commas inside a number must not split the value
        expected = {'': '10,000 Bells', 'WW': '8,000 Bells'}
        self.assertEqual(self._categorize('10,000 Bells, 8,000 Bells (WW)'), expected)
class TestGSV(unittest.TestCase):
    """Checks for GameSpecificValue lookup and default-game handling."""

    def setUp(self):
        # Several tests overwrite the class-level DEFAULT_GAME; put the
        # original back afterwards so the change cannot leak between tests.
        original = animalcrossing.GameSpecificValue.DEFAULT_GAME
        self.addCleanup(setattr, animalcrossing.GameSpecificValue,
                        'DEFAULT_GAME', original)

    def testEmptyValue(self):
        gsv = animalcrossing.GameSpecificValue({})
        self.assertEqual([], gsv.games())
        self.assertEqual(None, gsv.as_game())

    def testAllValue(self):
        gsv = animalcrossing.GameSpecificValue({'': 'Yes'})
        self.assertEqual([''], gsv.games())
        self.assertEqual('Yes', gsv.as_game())

    def testTwoValue(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = 'NL'
        gsv = animalcrossing.GameSpecificValue({'': 'Yes', 'WW': 'No'})
        # games() follows dict order, which is not guaranteed on the
        # interpreters this code targets, so compare sorted
        self.assertEqual(['', 'WW'], sorted(gsv.games()))
        self.assertEqual('Yes', gsv.as_game())
        self.assertEqual('No', gsv.as_game('WW'))

    def testGameValue(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = 'NL'
        gsv = animalcrossing.GameSpecificValue({'': 'Yes', 'WW': 'No'})
        self.assertEqual('NL', gsv.display_game)
        self.assertEqual(['', 'WW'], sorted(gsv.games()))
        gsv.display_game = 'WW'
        self.assertEqual('No', gsv.as_game())

    def testDefaultValue(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = 'WW'
        gsv = animalcrossing.GameSpecificValue({'': 'Yes', 'WW': 'No'})
        self.assertEqual('WW', gsv.display_game)
        self.assertEqual(['', 'WW'], sorted(gsv.games()))
        self.assertEqual('No', gsv.as_game())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment