Last active
March 14, 2018 11:11
-
-
Save kvnxiao/c2a98dcc9aac5eb483e2c8a9790461b5 to your computer and use it in GitHub Desktop.
taboo_scraper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from collections import namedtuple | |
from bs4 import BeautifulSoup | |
import asyncio | |
import aiohttp | |
import tqdm | |
import json | |
import sys | |
import argparse | |
class Word:
    """A guess word together with the related words listed alongside it."""

    def __init__(self, word, others):
        # Both values are stored exactly as given; no copying or validation.
        self.word, self.others = word, others
class WordJsonEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialise ``Word`` instances."""

    def default(self, obj):  # pylint: disable=E0202
        """Return a JSON-ready dict for a ``Word``; defer everything else.

        Objects that are not ``Word`` instances are handed to the base
        encoder, which raises ``TypeError`` for unsupported types.
        """
        if not isinstance(obj, Word):
            return super().default(obj)
        return {"word": obj.word, "others": obj.others}
def tag_to_text(tag):
    """Return the ``text`` attribute of *tag*, converted to lower case."""
    content = tag.text
    return content.lower()
def num_words(args):
    """Return how many words to fetch, read from ``args.number``.

    Args:
        args: Object exposing a ``number`` attribute (the parsed
            command-line namespace in this script).

    Returns:
        ``int(args.number)`` when the value converts cleanly, otherwise 1.
        A missing value (``None``) also yields 1.
    """
    number = args.number
    if number is None:
        return 1
    try:
        return int(number)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the default. A bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        return 1
def output(data, args):
    """Write *data* as JSON to the file named by ``args.output``.

    The file name falls back to "output.json" when ``args.output`` is
    ``None``. When ``args.pretty`` is truthy the JSON is indented by two
    spaces; otherwise it is written in the default compact form.
    Serialisation goes through ``WordJsonEncoder``.
    """
    target = "output.json" if args.output is None else args.output
    with open(target, "w") as handle:
        # indent=None is json.dump's default, i.e. no pretty-printing.
        indent = 2 if args.pretty else None
        json.dump(data, handle, cls=WordJsonEncoder, indent=indent)
async def fetch_words(url):
    """Fetch one page from *url* and parse it into a ``Word``.

    A fresh aiohttp session is opened for the request. The response body
    is parsed with BeautifulSoup's "html.parser"; the first ``<h2>`` with
    class "game-word" becomes the word and the text of every ``<li>``
    becomes the list of other words. All text is lower-cased.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        A ``Word`` built from the parsed page.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        IndexError: If the page contains no ``<h2>`` with class
            "game-word".
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Fail on HTTP errors instead of parsing an error page as a card.
            response.raise_for_status()
            content = await response.read()

    soup = BeautifulSoup(content, "html.parser")
    headings = soup.find_all("h2", ["game-word"])
    if not headings:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(
            "no <h2 class=\"game-word\"> element in response from {}".format(url))
    word = tag_to_text(headings[0])
    others = [tag_to_text(item) for item in soup.find_all("li")]
    return Word(word, others)
async def main_loop():
    """Parse command-line options, fetch words concurrently and save them.

    One ``fetch_words`` task is scheduled per requested word (the count
    comes from ``num_words``). Results are consumed in completion order
    behind a tqdm progress bar. A word whose headword was already seen is
    dropped, so the output may hold fewer entries than requested. The
    collected words are then written with ``output``.
    """
    url = "http://playtaboo.com/ajax/v1/next"

    # parse cmd-line args
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Output file name (defaults to \"output.json\")")
    parser.add_argument("-n", "--number", help="Number of words to fetch (defaults to 1).")
    parser.add_argument("-p", "--pretty", help="Whether to pretty-print the JSON or not.", action="store_true")
    args = parser.parse_args()

    # Schedule through ensure_future rather than instantiating Task by
    # hand, which the asyncio documentation discourages.
    tasks = [asyncio.ensure_future(fetch_words(url))
             for _ in range(num_words(args))]

    word_list = []
    word_set = set()
    for task in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        word = await task
        # Avoid duplicates
        if word.word not in word_set:
            word_set.add(word.word)
            word_list.append(word)
    output(word_list, args)
if __name__ == "__main__":
    # Script entry point: run main_loop() to completion on the event loop.
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main_loop())
    finally:
        # close must be *called*; a bare ``loop.close`` attribute reference
        # is a no-op. finally ensures the loop is closed even on failure.
        loop.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment