Skip to content

Instantly share code, notes, and snippets.

@msullivan
Created December 10, 2021 20:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msullivan/17a6abba8281e5610e189db9d82b925c to your computer and use it in GitHub Desktop.
Save msullivan/17a6abba8281e5610e189db9d82b925c to your computer and use it in GitHub Desktop.
Scrape an advent of code problem description for inputs
#!/usr/bin/env python3
"""Script that tries to scrape all potential test inputs.
By default it writes all of them to files in the directory
<day>.tests. It also prints all of the contents out along
with the file names so that you can quickly inspect and
determine which you want to use.
Depends on advent-of-code-data 1.1.0 (later versions may
also work, but this depends on internals).
Also depends on beautifulsoup, but that should get picked
up by advent-of-code-data ;).
"""
from aocd.get import current_day, most_recent_year
from aocd.models import default_user, Puzzle
import bs4
import bs4.element
import html
import os.path
import argparse
# adapted from aocd internals
def get_puzzle(session=None, day=None, year=None):
    """
    Get puzzle for day (1-25) and year (>= 2015)
    User's session cookie is needed (puzzle inputs differ by user)

    Any argument left as None is filled in: ``session`` falls back to
    ``default_user()``, ``day`` to ``current_day()`` and ``year`` to
    ``most_recent_year()``.

    Returns the ``Puzzle`` built from the resolved year, day and user.
    """
    if session is None:
        user = default_user()
    else:
        # ``User`` is not among the module-level imports, so an explicit
        # session used to fail with NameError; import it where it is needed.
        from aocd.models import User
        user = User(token=session)
    if day is None:
        day = current_day()
    if year is None:
        year = most_recent_year()
    return Puzzle(year=year, day=day, user=user)
def cleanup(el):
    """Reduce a parsed node to plain text where that is possible.

    - A ``NavigableString`` is returned as a plain ``str``.
    - An ``<em>`` tag is stripped (ems appear in plenty of inputs): its
      children are cleaned recursively and, when every one of them
      reduces to text, the pieces are joined into a single ``str``.
      An empty ``<em>`` therefore yields ``''``.
    - Anything else, including an ``<em>`` that holds a child which does
      not reduce to text, is returned as a non-string so that callers
      can tell the node is not plain text.
    """
    if isinstance(el, bs4.element.NavigableString):
        return str(el)
    if isinstance(el, bs4.element.Tag) and el.name == 'em':
        # Look at every child rather than only contents[0]: indexing the
        # first child raised IndexError on an empty <em> and silently
        # dropped any text after the first child.
        parts = [cleanup(child) for child in el.contents]
        if all(isinstance(part, str) for part in parts):
            return ''.join(parts)
    return el
def slurp(soup):
    """Collect the text of every ``<pre>`` block that wraps a ``<code>``.

    Each child of the ``<code>`` element is passed through ``cleanup``.
    A block is kept only when all of its children come back as strings;
    the strings are concatenated and HTML-unescaped. Returns the list of
    resulting texts in the order ``find_all('pre')`` yields them.
    """
    tests = []
    for pre in soup.find_all('pre'):
        code = pre.find('code')
        if not code:
            continue
        pieces = [cleanup(child) for child in code.contents]
        if any(not isinstance(piece, str) for piece in pieces):
            # Some child is still markup, so this is not a plain input.
            continue
        tests.append(html.unescape(''.join(pieces)))
    return tests
def write(day, tests, dirname, dry=True):
    """Print every test and, unless ``dry``, save each one to a file.

    Files are named ``1``, ``2``, ... inside ``dirname``; when
    ``dirname`` is empty or None the directory ``<day>.tests`` is used.
    The directory is created if needed, but only when not in dry mode.
    Each test is echoed to stdout after a ``==== <path>`` header
    regardless of ``dry``.
    """
    target = dirname if dirname else f'{day}.tests'
    persist = not dry

    if persist:
        os.makedirs(target, exist_ok=True)

    for index, contents in enumerate(tests, start=1):
        path = os.path.join(target, str(index))
        print(f'==== {path}')
        print(contents)
        if persist:
            with open(path, 'w') as out:
                out.write(contents)
# Command-line interface: two optional positional integers select the
# puzzle, and two options control where (and whether) files are written.
parser = argparse.ArgumentParser(
    description='AOC test input scraper',
)
parser.add_argument("day", nargs="?", type=int)
parser.add_argument("year", nargs="?", type=int)
parser.add_argument(
    '--dry', '-d',
    action='store_true',
    help='Do a dry run, without writing files',
)
parser.add_argument(
    '--dir',
    type=str,
    help='Override target directory',
)
def main():
    """Parse the command line, scrape the puzzle page and emit the tests."""
    options = parser.parse_args()
    puzzle = get_puzzle(day=options.day, year=options.year)
    # _soup() is a private aocd helper; the module docstring notes the
    # dependency on aocd internals.
    soup = puzzle._soup()
    write(puzzle.day, slurp(soup), dirname=options.dir, dry=options.dry)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment