Skip to content

Instantly share code, notes, and snippets.

@kvnxiao
Last active March 14, 2018 11:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kvnxiao/c2a98dcc9aac5eb483e2c8a9790461b5 to your computer and use it in GitHub Desktop.
Save kvnxiao/c2a98dcc9aac5eb483e2c8a9790461b5 to your computer and use it in GitHub Desktop.
taboo_scraper.py
#!/usr/bin/env python
from collections import namedtuple
from bs4 import BeautifulSoup
import asyncio
import aiohttp
import tqdm
import json
import sys
import argparse
class Word:
    """Hold a headword together with the list of words that accompany it.

    Both values are stored exactly as passed in; nothing is copied or
    validated.
    """

    def __init__(self, word, others):
        # ``word`` is the headword, ``others`` the accompanying words.
        self.word, self.others = word, others
class WordJsonEncoder(json.JSONEncoder):
    """JSON encoder that additionally knows how to serialise ``Word``."""

    def default(self, obj):  # pylint: disable=E0202
        """Return a JSON-serialisable stand-in for *obj*.

        ``Word`` instances become ``{"word": ..., "others": ...}``; every
        other object is delegated to the base encoder's ``default``.
        """
        if not isinstance(obj, Word):
            return super().default(obj)
        return {"word": obj.word, "others": obj.others}
def tag_to_text(tag):
    """Return the ``text`` attribute of *tag*, converted to lower case."""
    text = tag.text
    return text.lower()
def num_words(args):
    """Return how many words to fetch, taken from ``args.number``.

    Args:
        args: Object exposing a ``number`` attribute (the parsed
            command-line namespace in this script).

    Returns:
        ``int(args.number)`` when the conversion succeeds; ``1`` when the
        value is ``None`` or cannot be converted to an integer.
    """
    number = args.number
    if number is None:
        return 1
    try:
        return int(number)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the default. A bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return 1
def output(data, args):
    """Write *data* as JSON to the file selected by *args*.

    Args:
        data: Object to serialise; encoded with ``WordJsonEncoder``.
        args: Object exposing ``output`` (target file name, or ``None`` to
            use ``"output.json"``) and ``pretty`` (truthy to indent the
            JSON by two spaces).
    """
    target = "output.json" if args.output is None else args.output
    # ``indent=None`` is json.dump's default, i.e. the compact form.
    indent = 2 if args.pretty else None
    with open(target, "w") as handle:
        json.dump(data, handle, cls=WordJsonEncoder, indent=indent)
async def fetch_words(url):
    """Download *url* and parse the response body into a ``Word``.

    The headword is the lower-cased text of the first ``<h2>`` matched by
    the ``"game-word"`` filter; the accompanying words are the lower-cased
    texts of every ``<li>`` element in the document.

    Raises:
        IndexError: If the page contains no matching ``<h2>`` element.
    """
    # A fresh session is opened for each call and closed once the body is read.
    async with aiohttp.ClientSession() as session, session.get(url) as response:
        content = await response.read()

    soup = BeautifulSoup(content, "html.parser")
    headings = [tag_to_text(tag) for tag in soup.find_all("h2", ["game-word"])]
    headword = headings[0]
    related = [tag_to_text(tag) for tag in soup.find_all("li")]
    return Word(headword, related)
async def main_loop():
    """Fetch the requested number of words concurrently and save them.

    Parses the command-line options, schedules one ``fetch_words`` call per
    requested word, gathers the results as they complete (with a tqdm
    progress bar), keeps only the first result for each headword, and
    passes the resulting list to ``output``.

    Because repeated headwords are dropped, the written list can contain
    fewer entries than the number requested with ``--number``.
    """
    url = "http://playtaboo.com/ajax/v1/next"

    # parse cmd-line args
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Output file name (defaults to \"output.json\")")
    parser.add_argument("-n", "--number", help="Number of words to fetch (defaults to 1).")
    parser.add_argument("-p", "--pretty", help="Whether to pretty-print the JSON or not.", action="store_true")
    args = parser.parse_args()

    # Schedule through ensure_future rather than instantiating asyncio.Task
    # directly; the asyncio documentation discourages manual Task creation.
    tasks = [asyncio.ensure_future(fetch_words(url))
             for _ in range(num_words(args))]

    word_list = []
    word_set = set()
    for task in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        word = await task
        # Avoid duplicates
        if word.word not in word_set:
            word_set.add(word.word)
            word_list.append(word)

    output(word_list, args)
if __name__ == "__main__":
    # Script entry point: drive main_loop() to completion on the event loop.
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main_loop())
    finally:
        # close() must actually be called: a bare ``loop.close`` attribute
        # reference is a no-op and leaves the loop open.
        loop.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment