harryposner/get_nyt_xword.py

## get_nyt_xword.py
#!/usr/bin/env python3

"""Scrape NYT crossword puzzles in AcrossLite format from xwordinfo.com

Usage: ./get_nyt_xword.py [puzzle-iso-date] filename
"""

import re
import sys
import datetime

import puz
import requests
from bs4 import BeautifulSoup


# Crossword page on xwordinfo.com; get_soup() requests it, optionally
# together with a "date" value selecting the puzzle.
URL = "https://www.xwordinfo.com/Crossword"


def get_soup(puzzle_date=None, timeout=30):
    """Download a puzzle page from xwordinfo.com and parse it.

    Args:
        puzzle_date: ``datetime.date`` of the puzzle to fetch.  When it is
            omitted no date is sent (presumably the site then serves its
            default puzzle -- confirm against the site).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A ``BeautifulSoup`` tree of the response body, built with the
        "html.parser" backend.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    datestring = puzzle_date.strftime("%m/%d/%Y") if puzzle_date else None
    # `params` puts the date in the URL's query string, which is how a GET
    # request carries arguments; `data` would send it as a request body.
    # requests leaves out parameters whose value is None.
    resp = requests.get(URL, params={"date": datestring}, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, features="html.parser")


def xword_info(soup):
    """Collect every scraped field of a puzzle page into one dictionary.

    The statistics from `read_stats` come first, followed by the "author",
    "title", "solution", "clues" and "extensions" entries, each produced by
    the matching ``read_*`` helper.
    """
    return {
        **read_stats(soup),
        "author": read_author(soup),
        "title": read_title(soup),
        "solution": read_solution(soup),
        "clues": read_clues(soup),
        "extensions": read_extensions(soup),
    }


def read_stats(soup):
    """Read the integer statistics listed in the page's stats box.

    Every ``<span>`` inside the ``CPHContent_StatsData`` div whose text ends
    in a digit is read as a comma-separated list of ``name: value`` pairs.
    Surrounding whitespace is ignored and spans without text are skipped.
    "Rows" and "Columns" are stored under "height" and "width"; every other
    name is kept as it appears on the page.

    Returns:
        A dict mapping statistic names to ints.

    Raises:
        ValueError: If a pair is not of the form ``name: integer``.
    """
    renames = {"Rows": "height", "Columns": "width"}
    stats = {}
    for element in soup.find("div", id="CPHContent_StatsData").find_all("span"):
        text = element.text.strip()
        # An empty span has no last character to inspect; it carries no
        # statistics, so skip it instead of failing on text[-1].
        if not text or not text[-1].isdigit():
            continue
        for pair in text.split(","):
            name, value = pair.split(":")
            name = name.strip()
            stats[renames.get(name, name)] = int(value.strip())
    return stats


def read_title(soup):
    """Return the text of the page's ``<h1 id="PuzTitle">`` heading."""
    heading = soup.find("h1", id="PuzTitle")
    return heading.text


def read_author(soup):
    """Return the puzzle's byline as ``"<author> / <editor>"``.

    The text of the ``CPHContent_AEGrid`` div is read line by line as
    ``label: value`` pairs, of which only "Author" and "Editor" are used.
    Lines without a colon (blank lines, for instance) are skipped, and a
    value may itself contain colons.

    Raises:
        ValueError: If the author or the editor line is missing.
    """
    author_box = soup.find("div", id="CPHContent_AEGrid").text
    fields = {}
    for line in author_box.splitlines():
        # partition() splits on the first colon only, so it neither fails
        # on colon-free lines nor on values that contain a colon.
        label, colon, value = line.partition(":")
        if colon:
            fields[label.strip()] = value.strip()
    missing = [label for label in ("Author", "Editor") if label not in fields]
    if missing:
        raise ValueError(f"Couldn't find {' or '.join(missing)} in author box")
    author = fields["Author"]
    editor = fields["Editor"]
    return f"{author} / {editor}"


def puzzle_squares(soup):
    """Yield one ``{"val": ..., "markup": ...}`` dict per ``<td>`` of the grid.

    Cells are taken from the ``PuzTable`` table in document order:

    * A cell whose class list is exactly ``["black"]`` yields
      ``puz.BLACKSQUARE`` with ``markup`` set to ``None``.
    * Any other cell yields the text of its "letter" div or, failing that,
      of its "subst2" or "subst" div (tried in that order).  ``markup`` is
      True when the cell has a class attribute or contains a div that has
      a style attribute.

    Raises:
        ValueError: If a non-black cell contains none of those divs.
    """
    for tag in soup.find("table", id="PuzTable").find_all("td"):
        css_classes = tag.attrs.get("class")
        if css_classes == ["black"]:
            yield {"val": puz.BLACKSQUARE, "markup": None}
            continue

        content = tag.find("div", class_="letter")
        if content is None:
            # Fallback for squares without a plain letter div (rebus
            # entries, going by how read_extensions uses long values).
            content = (tag.find("div", class_="subst2")
                       or tag.find("div", class_="subst"))
        if content is None:
            raise ValueError(f"Couldn't read puzzle tag: {tag}")

        # Black cells never reach this point, so their "black" class is
        # never mistaken for markup.
        has_markup = bool(css_classes or tag.find("div", style=True))
        yield {"val": content.text, "markup": has_markup}


def read_solution(soup):
    """Return the grid as one string: the first character of every square.

    Squares come from `puzzle_squares`, so multi-character values
    contribute only their first character.
    """
    first_chars = [square["val"][0] for square in puzzle_squares(soup)]
    return "".join(first_chars)


# Per-square bytes used in the extension grids: EMPTY marks "nothing here",
# MARKUP marks a square that carries markup.
EMPTY = b"\x00"
MARKUP = b"\x80"
def read_extensions(soup):
    """Build the rebus and markup extension sections for a puzzle.

    Every square from `puzzle_squares` contributes one byte to a rebus grid
    and one byte to a markup grid.  A square whose value is longer than one
    character is a rebus square: its value is added to the rebus table and
    its grid byte is the value's 1-based table number plus one.

    Returns:
        A dict keyed by ``puz.Extensions`` members.  ``Rebus`` and
        ``RebusSolutions`` are present only when there is at least one
        rebus square; ``Markup`` only when the markup grid holds both
        marked and unmarked squares.  Keys are inserted in that order.
    """
    rebus_words = []
    rebus_grid = []
    markup_grid = []
    for square in puzzle_squares(soup):
        value = square["val"]
        if len(value) > 1:
            rebus_words.append(value)
            # The word was just appended, so len() is its 1-based number.
            grid_byte = (len(rebus_words) + 1).to_bytes(1, "big")
        else:
            grid_byte = EMPTY
        rebus_grid.append(grid_byte)
        markup_grid.append(MARKUP if square["markup"] else EMPTY)

    extensions = {}
    if rebus_words:
        extensions[puz.Extensions.Rebus] = b"".join(rebus_grid)
        table = "".join(
            f"{i+1:2}:{word};" for i, word in enumerate(rebus_words)
        )
        extensions[puz.Extensions.RebusSolutions] = table.encode("ascii")
    # Only two byte values ever go into the grid, so "both present" is the
    # same as "more than one distinct value".
    if MARKUP in markup_grid and EMPTY in markup_grid:
        extensions[puz.Extensions.Markup] = b"".join(markup_grid)
    return extensions


def read_clues(soup):
    """Return every clue's text in numeric order, across before down on ties.

    The page must hold exactly two ``<div class="numclue">`` panels, the
    first taken as across and the second as down.  The children of a panel
    alternate between a clue number and the clue itself; only the first
    text node directly inside each child is used, and trailing spaces and
    colons are stripped from the clue.

    Raises:
        ValueError: If the number of panels is not two, a clue number has
            no clue after it, or a clue number is not an integer.
    """
    def extract_clues(panel):
        """Return the ``(number, clue)`` pairs of one panel."""
        texts = [t.find(text=True, recursive=False) for t in panel.children]
        if len(texts) % 2:
            # Pairing with a bare next() would raise StopIteration here,
            # which the map() below would swallow as "no more panels".
            raise ValueError(f"Clue number {texts[-1]!r} has no clue text")
        return [
            (int(number), clue.rstrip(" :"))
            for number, clue in zip(texts[::2], texts[1::2])
        ]

    across, down = map(extract_clues, soup.find_all("div", class_="numclue"))
    # sorted() is stable, so across comes before down when clues share a
    # number.
    all_clues = sorted(across + down, key=lambda num_and_clue: num_and_clue[0])
    return [clue for __, clue in all_clues]


def validate_scrape(info):
    """Check that the scraped grid and clues agree with the page's statistics.

    Compares the number of black squares with "Blocks", the grid size with
    "height" times "width", and the number of clues with "Words".

    Args:
        info: Dictionary as produced by `xword_info`.

    Raises:
        ValueError: If any of the three counts disagrees.  Explicit raises
            are used rather than ``assert`` so the checks still run under
            ``python -O``.
    """
    solution = info["solution"]
    checks = (
        ("black squares", info["Blocks"], solution.count(puz.BLACKSQUARE)),
        ("grid squares", info["height"] * info["width"], len(solution)),
        ("clues", info["Words"], len(info["clues"])),
    )
    for what, expected, found in checks:
        if expected != found:
            raise ValueError(
                f"Scraped {found} {what}, but the page reports {expected}"
            )


def get_puzzle(puzzle_date=None):
    """Download, scrape and validate one puzzle, returned as a ``puz.Puzzle``.

    Args:
        puzzle_date: ``datetime.date`` of the puzzle, handed to `get_soup`;
            ``None`` requests the page without a date.

    Returns:
        A ``puz.Puzzle`` carrying the scraped fields, an empty fill and the
        scraped extensions.

    Raises:
        Whatever `get_soup`, `xword_info` or `validate_scrape` raise.
    """
    puzzle = puz.Puzzle()
    info = xword_info(get_soup(puzzle_date))
    validate_scrape(info)
    # Copy only the scraped fields that a fresh Puzzle already has as
    # instance attributes; any other key in `info` is ignored.
    for k, v in info.items():
        if k in vars(puzzle):
            setattr(puzzle, k, v)
    # The starting fill blanks every non-black square.  Blanking only
    # [A-Z] would leave digits or symbols from the solution pre-filled.
    puzzle.fill = "".join(
        square if square == puz.BLACKSQUARE else "-"
        for square in puzzle.solution
    )
    # This relies on dictionaries preserving insertion order
    puzzle._extensions_order = list(puzzle.extensions)
    return puzzle


def main(*argv):
    """Command-line entry point: fetch one puzzle and save it to a file.

    Args:
        *argv: The program name followed by ``[puzzle-iso-date] filename``.

    Returns:
        0 on success; 2 on a usage error (wrong number of arguments or a
        date that is not in ISO ``YYYY-MM-DD`` form), after writing a usage
        message to stderr.
    """
    prog = argv[0] if argv else "get_nyt_xword.py"
    usage = f"usage: {prog} [puzzle-iso-date] file\n"
    if len(argv) == 3:
        __, datestring, fname_out = argv
        try:
            puzzle_date = datetime.date.fromisoformat(datestring)
        except ValueError as err:
            # Report a bad date like any other usage error rather than
            # letting the traceback reach the user.
            sys.stderr.write(usage)
            sys.stderr.write(
                f"{prog}: error: invalid puzzle date {datestring!r}: {err}\n"
            )
            return 2
    elif len(argv) == 2:
        fname_out = argv[1]
        puzzle_date = None
    else:
        if len(argv) > 3:
            problem = "too many arguments"
        else:
            problem = "the file argument is required"
        sys.stderr.write(usage)
        sys.stderr.write(f"{prog}: error: {problem}\n")
        return 2
    puzzle = get_puzzle(puzzle_date)
    puzzle.save(fname_out)
    print(f"Saved {puzzle.title} to {fname_out}")
    return 0


def get_unreadable_puzzles(start=datetime.date(1993, 11, 21), delay=1):
    """List the dates whose puzzles `get_puzzle` fails to produce.

    Every date from `start` up to, but not including, today is tried in
    turn.  A date is recorded (and printed) when `get_puzzle` raises any
    ``Exception``; the function then carries on with the next date.

    Args:
        start: First ``datetime.date`` to try.
        delay: Seconds to sleep after every attempt, successful or not.

    Returns:
        The list of dates that could not be read, in chronological order.
    """
    import time

    one_day = datetime.timedelta(1)
    stop = datetime.date.today()
    failed = []
    puzzle_date = start
    while puzzle_date < stop:
        try:
            get_puzzle(puzzle_date)
        except Exception:
            failed.append(puzzle_date)
            print(puzzle_date)
        finally:
            time.sleep(delay)
        puzzle_date += one_day
    return failed


# Run as a script: hand the command line to main() and use its return
# value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(*sys.argv))
	#!/usr/bin/env python3

	"""Scrape NYT crossword puzzles in AcrossLite format from xwordinfo.com

	Usage: ./get_nyt_xword.py [puzzle-iso-date] filename
	"""

	import re
	import sys
	import datetime

	import puz
	import requests
	from bs4 import BeautifulSoup


	# Crossword page on xwordinfo.com; get_soup() requests it, optionally
	# together with a "date" value selecting the puzzle.
	URL = "https://www.xwordinfo.com/Crossword"


	def get_soup(puzzle_date=None, timeout=30):
		"""Download a puzzle page from xwordinfo.com and parse it.

		Args:
			puzzle_date: ``datetime.date`` of the puzzle to fetch.  When it
				is omitted no date is sent (presumably the site then serves
				its default puzzle -- confirm against the site).
			timeout: Seconds to wait for the server before giving up, so a
				stalled connection cannot hang the script forever.

		Returns:
			A ``BeautifulSoup`` tree of the response body, built with the
			"html.parser" backend.

		Raises:
			requests.HTTPError: If the server answers with an error status.
			requests.Timeout: If the server does not answer within `timeout`.
		"""
		datestring = puzzle_date.strftime("%m/%d/%Y") if puzzle_date else None
		# `params` puts the date in the URL's query string, which is how a
		# GET request carries arguments; `data` would send it as a request
		# body.  requests leaves out parameters whose value is None.
		resp = requests.get(URL, params={"date": datestring}, timeout=timeout)
		resp.raise_for_status()
		return BeautifulSoup(resp.text, features="html.parser")


	def xword_info(soup):
		"""Collect every scraped field of a puzzle page into one dictionary.

		The statistics from `read_stats` come first, followed by the
		"author", "title", "solution", "clues" and "extensions" entries,
		each produced by the matching ``read_*`` helper.
		"""
		info = {}
		info.update(**read_stats(soup))
		info["author"] = read_author(soup)
		info["title"] = read_title(soup)
		info["solution"] = read_solution(soup)
		info["clues"] = read_clues(soup)
		info["extensions"] = read_extensions(soup)
		return info


	def read_stats(soup):
		"""Read the integer statistics listed in the page's stats box.

		Every ``<span>`` inside the ``CPHContent_StatsData`` div whose text
		ends in a digit is read as a comma-separated list of ``name: value``
		pairs.  Surrounding whitespace is ignored and spans without text are
		skipped.  "Rows" and "Columns" are stored under "height" and
		"width"; every other name is kept as it appears on the page.

		Returns:
			A dict mapping statistic names to ints.

		Raises:
			ValueError: If a pair is not of the form ``name: integer``.
		"""
		renames = {"Rows": "height", "Columns": "width"}
		stats = {}
		for element in soup.find("div", id="CPHContent_StatsData").find_all("span"):
			text = element.text.strip()
			# An empty span has no last character to inspect; it carries no
			# statistics, so skip it instead of failing on text[-1].
			if not text or not text[-1].isdigit():
				continue
			for pair in text.split(","):
				name, value = pair.split(":")
				name = name.strip()
				stats[renames.get(name, name)] = int(value.strip())
		return stats


	def read_title(soup):
		"""Return the text of the page's ``<h1 id="PuzTitle">`` heading."""
		return soup.find("h1", id="PuzTitle").text


	def read_author(soup):
		"""Return the puzzle's byline as ``"<author> / <editor>"``.

		The text of the ``CPHContent_AEGrid`` div is read line by line as
		``label: value`` pairs, of which only "Author" and "Editor" are
		used.  Lines without a colon (blank lines, for instance) are
		skipped, and a value may itself contain colons.

		Raises:
			ValueError: If the author or the editor line is missing.
		"""
		author_box = soup.find("div", id="CPHContent_AEGrid").text
		fields = {}
		for line in author_box.splitlines():
			# partition() splits on the first colon only, so it neither
			# fails on colon-free lines nor on values containing a colon.
			label, colon, value = line.partition(":")
			if colon:
				fields[label.strip()] = value.strip()
		missing = [label for label in ("Author", "Editor") if label not in fields]
		if missing:
			raise ValueError(f"Couldn't find {' or '.join(missing)} in author box")
		author = fields["Author"]
		editor = fields["Editor"]
		return f"{author} / {editor}"


	def puzzle_squares(soup):
		"""Yield one ``{"val": ..., "markup": ...}`` dict per ``<td>`` of the grid.

		Cells are taken from the ``PuzTable`` table in document order:

		* A cell whose class list is exactly ``["black"]`` yields
		  ``puz.BLACKSQUARE`` with ``markup`` set to ``None``.
		* Any other cell yields the text of its "letter" div or, failing
		  that, of its "subst2" or "subst" div (tried in that order).
		  ``markup`` is True when the cell has a class attribute or
		  contains a div that has a style attribute.

		Raises:
			ValueError: If a non-black cell contains none of those divs.
		"""
		puzzle_table = soup.find("table", id="PuzTable").find_all("td")
		for tag in puzzle_table:
			if tag.attrs.get("class") == ["black"]:
				yield {"val": puz.BLACKSQUARE, "markup": None}
				continue
			# `continue` above means `markup` won't ever be a black cell
			markup = bool(tag.attrs.get("class") or tag.find("div", style=True))
			letter_tag = tag.find("div", class_="letter")
			if letter_tag is not None:
				yield {"val": letter_tag.text, "markup": markup}
				continue
			rebus_tag = (
				tag.find("div", class_="subst2")
				or tag.find("div", class_="subst")
			)
			if rebus_tag is not None:
				yield {"val": rebus_tag.text, "markup": markup}
				continue
			raise ValueError(f"Couldn't read puzzle tag: {tag}")


	def read_solution(soup):
		"""Return the grid as one string: the first character of every square.

		Squares come from `puzzle_squares`, so multi-character values
		contribute only their first character.
		"""
		return "".join(sq["val"][0] for sq in puzzle_squares(soup))


	# Per-square bytes used in the extension grids: EMPTY marks "nothing
	# here", MARKUP marks a square that carries markup.
	EMPTY = b"\x00"
	MARKUP = b"\x80"
	def read_extensions(soup):
		"""Build the rebus and markup extension sections for a puzzle.

		Every square from `puzzle_squares` contributes one byte to a rebus
		grid and one byte to a markup grid.  A square whose value is longer
		than one character is a rebus square: its value is added to the
		rebus table and its grid byte is the value's 1-based table number
		plus one.

		Returns:
			A dict keyed by ``puz.Extensions`` members.  ``Rebus`` and
			``RebusSolutions`` are present only when there is at least one
			rebus square; ``Markup`` only when the markup grid holds both
			marked and unmarked squares.  Keys are inserted in that order.
		"""
		markup = []
		rebus_solutions = []
		rebus_locations = []
		for square in puzzle_squares(soup):
			if len(square["val"]) > 1:
				rebus_solutions.append(square["val"])
				# The word was just appended, so len() is its 1-based number.
				rebus_locations.append((len(rebus_solutions)+1).to_bytes(1, "big"))
			else:
				rebus_locations.append(EMPTY)
			if square["markup"]:
				markup.append(MARKUP)
			else:
				markup.append(EMPTY)
		extensions = {}
		if rebus_solutions:
			extensions[puz.Extensions.Rebus] = b"".join(rebus_locations)
			rebus = [f"{i+1:2}:{word};" for i, word in enumerate(rebus_solutions)]
			rebus = "".join(rebus).encode("ascii")
			extensions[puz.Extensions.RebusSolutions] = rebus
		if len(set(markup)) > 1:
			extensions[puz.Extensions.Markup] = b"".join(markup)
		return extensions


	def read_clues(soup):
		"""Return every clue's text in numeric order, across before down on ties.

		The page must hold exactly two ``<div class="numclue">`` panels, the
		first taken as across and the second as down.  The children of a
		panel alternate between a clue number and the clue itself; only the
		first text node directly inside each child is used, and trailing
		spaces and colons are stripped from the clue.

		Raises:
			ValueError: If the number of panels is not two, a clue number
				has no clue after it, or a clue number is not an integer.
		"""
		def extract_clues(panel):
			"""Return the ``(number, clue)`` pairs of one panel."""
			texts = [t.find(text=True, recursive=False) for t in panel.children]
			if len(texts) % 2:
				# Pairing with a bare next() would raise StopIteration here,
				# which the map() below would swallow as "no more panels".
				raise ValueError(f"Clue number {texts[-1]!r} has no clue text")
			return [
				(int(number), clue.rstrip(" :"))
				for number, clue in zip(texts[::2], texts[1::2])
			]

		across, down = map(extract_clues, soup.find_all("div", class_="numclue"))
		# sorted() is stable, so across comes before down when clues share
		# a number.
		all_clues = sorted(across + down, key=lambda num_and_clue: num_and_clue[0])
		return [clue for __, clue in all_clues]


	def validate_scrape(info):
		"""Check that the scraped grid and clues agree with the page's statistics.

		Compares the number of black squares with "Blocks", the grid size
		with "height" times "width", and the number of clues with "Words".

		Args:
			info: Dictionary as produced by `xword_info`.

		Raises:
			ValueError: If any of the three counts disagrees.  Explicit
				raises are used rather than ``assert`` so the checks still
				run under ``python -O``.
		"""
		solution = info["solution"]
		checks = (
			("black squares", info["Blocks"], solution.count(puz.BLACKSQUARE)),
			("grid squares", info["height"] * info["width"], len(solution)),
			("clues", info["Words"], len(info["clues"])),
		)
		for what, expected, found in checks:
			if expected != found:
				raise ValueError(
					f"Scraped {found} {what}, but the page reports {expected}"
				)


	def get_puzzle(puzzle_date=None):
		"""Download, scrape and validate one puzzle, returned as a ``puz.Puzzle``.

		Args:
			puzzle_date: ``datetime.date`` of the puzzle, handed to
				`get_soup`; ``None`` requests the page without a date.

		Returns:
			A ``puz.Puzzle`` carrying the scraped fields, an empty fill and
			the scraped extensions.

		Raises:
			Whatever `get_soup`, `xword_info` or `validate_scrape` raise.
		"""
		puzzle = puz.Puzzle()
		info = xword_info(get_soup(puzzle_date))
		validate_scrape(info)
		# Copy only the scraped fields that a fresh Puzzle already has as
		# instance attributes; any other key in `info` is ignored.
		for k, v in info.items():
			if k in vars(puzzle):
				setattr(puzzle, k, v)
		# The starting fill blanks every non-black square.  Blanking only
		# [A-Z] would leave digits or symbols from the solution pre-filled.
		puzzle.fill = "".join(
			square if square == puz.BLACKSQUARE else "-"
			for square in puzzle.solution
		)
		# This relies on dictionaries preserving insertion order
		puzzle._extensions_order = list(puzzle.extensions)
		return puzzle


	def main(*argv):
		"""Command-line entry point: fetch one puzzle and save it to a file.

		Args:
			*argv: The program name followed by ``[puzzle-iso-date] filename``.

		Returns:
			0 on success; 2 on a usage error (wrong number of arguments or a
			date that is not in ISO ``YYYY-MM-DD`` form), after writing a
			usage message to stderr.
		"""
		prog = argv[0] if argv else "get_nyt_xword.py"
		usage = f"usage: {prog} [puzzle-iso-date] file\n"
		if len(argv) == 3:
			__, datestring, fname_out = argv
			try:
				puzzle_date = datetime.date.fromisoformat(datestring)
			except ValueError as err:
				# Report a bad date like any other usage error rather than
				# letting the traceback reach the user.
				sys.stderr.write(usage)
				sys.stderr.write(
					f"{prog}: error: invalid puzzle date {datestring!r}: {err}\n"
				)
				return 2
		elif len(argv) == 2:
			fname_out = argv[1]
			puzzle_date = None
		else:
			if len(argv) > 3:
				problem = "too many arguments"
			else:
				problem = "the file argument is required"
			sys.stderr.write(usage)
			sys.stderr.write(f"{prog}: error: {problem}\n")
			return 2
		puzzle = get_puzzle(puzzle_date)
		puzzle.save(fname_out)
		print(f"Saved {puzzle.title} to {fname_out}")
		return 0



	def get_unreadable_puzzles(start=datetime.date(1993, 11, 21), delay=1):
		"""List the dates whose puzzles `get_puzzle` fails to produce.

		Every date from `start` up to, but not including, today is tried in
		turn.  A date is recorded (and printed) when `get_puzzle` raises any
		``Exception``; the function then carries on with the next date.

		Args:
			start: First ``datetime.date`` to try.
			delay: Seconds to sleep after every attempt, successful or not.

		Returns:
			The list of dates that could not be read, in chronological order.
		"""
		import time

		def daterange(start, stop):
			"""Yield each date from `start` up to, but excluding, `stop`."""
			current = start
			while current < stop:
				yield current
				current += datetime.timedelta(1)

		unreadable = []
		for puzzle_date in daterange(start, datetime.date.today()):
			try:
				get_puzzle(puzzle_date)
			except Exception:
				unreadable.append(puzzle_date)
				print(puzzle_date)
			finally:
				time.sleep(delay)
		return unreadable



	# Run as a script: hand the command line to main() and use its return
	# value as the process exit status.
	if __name__ == "__main__":
		sys.exit(main(*sys.argv))