Skip to content

Instantly share code, notes, and snippets.

@blinks
Created September 30, 2014 04:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blinks/9d783f0399617523a851 to your computer and use it in GitHub Desktop.
Save blinks/9d783f0399617523a851 to your computer and use it in GitHub Desktop.
gather.py
#!/usr/bin/env python
# Gather cards from wizards.com into local Elasticsearch.
# Adam Blinkinsop <http://plus.google.com/+AdamBlinkinsop>
from elasticsearch import Elasticsearch
import bs4
import re
import urllib
def main(args):
    """Index every card of the requested sets into Elasticsearch.

    args -- parsed command-line namespace; ``args.sets`` is the list of set
        names to gather.  When it is empty, the set names are read from the
        non-empty ``value`` attributes of the options in the set drop-down
        on the Gatherer front page.

    Every card yielded by ``cardsOf`` is written to the 'cauldron' index
    with doc type 'card', using the card title as the document id.
    """
    sets = args.sets
    if not sets:
        filename, headers = urllib.urlretrieve('http://gatherer.wizards.com/')
        # BeautifulSoup reads the whole document while constructing the
        # tree, so the downloaded file can be closed as soon as it returns.
        with open(filename) as page:
            soup = bs4.BeautifulSoup(page)
        # .get() skips <option> tags that carry no value attribute at all
        # instead of raising KeyError on them.
        sets = [t['value'] for t in soup.select(
            'select#ctl00_ctl00_MainContent_Content_SearchControls_setAddText'
            ' > option') if t.get('value')]
    # No arguments: the client's default connection settings are used.
    es = Elasticsearch()
    for name in sets:
        for card in cardsOf(name):
            es.index(index='cauldron', doc_type='card',
                     id=card['cardTitle'], body=card)
def soupOf(name, output='standard', page=0):
    """Fetch one page of Gatherer search results for a set and parse it.

    name -- set name; it is sent as the ``set`` query parameter, wrapped
        as ``["<name>"]``.
    output -- value of the ``output`` query parameter.
    page -- zero-based index passed as the ``page`` query parameter.

    Returns the parsed page as a ``bs4.BeautifulSoup`` object.
    """
    set_filter = '["%s"]' % name
    # Python 2's urlencode calls str() on each value, which raises
    # UnicodeEncodeError for unicode text with non-ASCII characters, so
    # such names are encoded to UTF-8 bytes first.
    if isinstance(set_filter, unicode):
        set_filter = set_filter.encode('utf-8')
    url = ('http://gatherer.wizards.com/Pages/Search/Default.aspx?' +
           urllib.urlencode({
               'output': output, 'page': page,
               'set': set_filter,
           }))
    filename, headers = urllib.urlretrieve(url)
    # The tree is fully built inside the constructor, so the file handle
    # can be released immediately afterwards.
    with open(filename) as results:
        return bs4.BeautifulSoup(results)
def cardsOf(name):
    """Yield one dict per card row found in the search results for a set.

    name -- set name, passed through to ``soupOf``.

    Result pages are fetched one at a time starting at page 0.  Each
    ``tr.cardItem`` row becomes a dict with the keys 'cardTitle',
    'manaCost', 'convertedManaCost', 'typeLine', 'rulesText' and
    'setVersions', each holding the flattened text of the matching element.

    Iteration ends when a page has no ``div.pagingcontrols`` element, or
    when that element holds no text ending in '>' that sits inside an
    ``<a>`` tag (presumably the "next page" link -- confirm against the
    live markup).
    """
    page = 0
    while True:
        # Progress trace: set name and page index, separated by a space.
        print('%s %s' % (name, page))
        soup = soupOf(name, page=page)
        for item in soup.find_all('tr', class_='cardItem'):
            yield {
                'cardTitle': oracleOf(item, 'span.cardTitle'),
                'manaCost': oracleOf(item, 'span.manaCost'),
                'convertedManaCost': oracleOf(item, 'span.convertedManaCost'),
                'typeLine': oracleOf(item, 'span.typeLine'),
                'rulesText': oracleOf(item, 'div.rulesText'),
                'setVersions': oracleOf(item, 'td.setVersions'),
            }
        paging = soup.find('div', class_='pagingcontrols')
        if paging is None:
            # No paging controls on this page: nothing further to fetch.
            return
        n = paging.find(text=lambda s: s.endswith('>'))
        if n is None or n.find_parent('a') is None:
            return
        page += 1
def oracleOf(soup, selector):
    """Return the flattened text of the first match for a CSS selector.

    soup -- parsed node to search within.
    selector -- CSS selector handed to ``soup.select``.

    The first matching element is flattened with ``oracleOfHtml`` and every
    run of spaces in the result is collapsed to a single space.  Raises
    IndexError when the selector matches nothing.
    """
    first_match = soup.select(selector)[0]
    flattened = oracleOfHtml(first_match)
    return re.sub(r' +', ' ', flattened)
def oracleOfHtml(soup):
    """Recursively flatten a parsed node into plain text.

    - A string node becomes its text with surrounding whitespace stripped.
    - An ``<img>`` tag becomes its ``alt`` attribute wrapped in braces.
    - Any other tag becomes the flattened text of its children joined by
      single spaces, with the joined result stripped.
    """
    if isinstance(soup, bs4.NavigableString):
        return unicode(soup).strip()
    if soup.name == 'img':
        return u'{%s}' % soup['alt']
    pieces = [oracleOfHtml(child) for child in soup.children]
    return ' '.join(pieces).strip()
if __name__ == '__main__':
    import argparse

    # Command line: zero or more positional set names.  With none given,
    # args.sets is an empty list and main() looks the set names up itself.
    cli = argparse.ArgumentParser(
        description='Gather cards from wizards.com into Elasticsearch')
    cli.add_argument(
        'sets',
        nargs='*',
        type=unicode,
        metavar='N',
        help='a set to gather')
    args = cli.parse_args()
    main(args)
@blinks
Copy link
Author

blinks commented Sep 30, 2014

TODO

  • Pull out power/toughness/loyalty from the typeline.
  • Use the bulk API for Elasticsearch.
  • Write a schema to this index to enable Lucene's numeric search features.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment