Skip to content

Instantly share code, notes, and snippets.

@harryposner
Last active April 11, 2024 21:09
Show Gist options
  • Save harryposner/41313082be84213b33ea6947f5475e4e to your computer and use it in GitHub Desktop.
Save harryposner/41313082be84213b33ea6947f5475e4e to your computer and use it in GitHub Desktop.
Scrape NYT crossword puzzles in AcrossLite format from xwordinfo.com
#!/usr/bin/env python3
"""Scrape NYT crossword puzzles in AcrossLite format from xwordinfo.com
Usage: ./get_nyt_xword.py [puzzle-iso-date] filename
"""
import re
import sys
import datetime
import puz
import requests
from bs4 import BeautifulSoup
# Page requested by get_soup(); a "date" value selects a specific puzzle.
URL = "https://www.xwordinfo.com/Crossword"
def get_soup(puzzle_date=None):
    """Download a puzzle page and return it parsed as a BeautifulSoup tree.

    puzzle_date: optional date object; it is formatted as MM/DD/YYYY and sent
        as the ``date`` query parameter.  When omitted, no date is sent and
        the server decides which puzzle to return.

    Raises requests.HTTPError (via ``raise_for_status``) on a 4xx/5xx reply.
    """
    # A GET request carries its arguments in the query string (`params`), not
    # in a request body (`data`).
    query = {"date": puzzle_date.strftime("%m/%d/%Y")} if puzzle_date else {}
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.get(URL, params=query, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, features="html.parser")
def xword_info(soup):
    """Collect everything scraped from the page into a single dict.

    The result holds every entry produced by read_stats() followed by the
    "author", "title", "solution", "clues" and "extensions" entries, each
    produced by the matching read_* function.
    """
    return {
        **read_stats(soup),
        "author": read_author(soup),
        "title": read_title(soup),
        "solution": read_solution(soup),
        "clues": read_clues(soup),
        "extensions": read_extensions(soup),
    }
def read_stats(soup):
    """Parse the numeric statistics listed in the page's stats panel.

    Every ``<span>`` inside ``<div id="CPHContent_StatsData">`` whose text ends
    in a digit is read as comma-separated ``Key: number`` pairs.  The keys
    "Rows" and "Columns" are stored as "height" and "width"; every other key
    is kept as written.

    Returns a dict mapping key -> int.
    """
    renames = {"Rows": "height", "Columns": "width"}
    stats = {}
    for element in soup.find("div", id="CPHContent_StatsData").find_all("span"):
        text = element.text
        # Spans that do not end in a digit are not "Key: number" lists.  The
        # emptiness check keeps a blank span from raising IndexError.
        if not text or not text[-1].isdigit():
            continue
        for pair in text.split(","):
            key, value = pair.split(":")
            key = key.strip()
            stats[renames.get(key, key)] = int(value.strip())
    return stats
def read_title(soup):
    """Return the text of the page's ``<h1 id="PuzTitle">`` heading."""
    heading = soup.find("h1", id="PuzTitle")
    return heading.text
def read_author(soup):
    """Return the puzzle credit formatted as ``"<author> / <editor>"``.

    The text of ``<div id="CPHContent_AEGrid">`` is read line by line as
    ``Label: value`` pairs; the "Author" and "Editor" labels supply the two
    names.  If a label occurs more than once the last occurrence wins.

    Raises ValueError when either label is absent.
    """
    author_box = soup.find("div", id="CPHContent_AEGrid").text.strip()
    credits = {}
    for line in author_box.splitlines():
        # Split on the first colon only, so a value that itself contains a
        # colon is kept whole; lines without a colon (e.g. blank ones) are
        # skipped instead of breaking tuple unpacking.
        label, colon, value = line.partition(":")
        if colon:
            credits[label.strip()] = value.strip()
    missing = [role for role in ("Author", "Editor") if role not in credits]
    if missing:
        raise ValueError(f"Couldn't read {' and '.join(missing)} from author box")
    return f"{credits['Author']} / {credits['Editor']}"
def puzzle_squares(soup):
    """Yield one dict per ``<td>`` of ``<table id="PuzTable">``, in page order.

    Each dict has two keys:
      "val":    puz.BLACKSQUARE for a cell whose class list is exactly
                ["black"]; otherwise the text of the cell's "letter" div, or
                failing that of its "subst2" / "subst" div (tried in that
                order).
      "markup": None for a black cell; otherwise True when the cell has any
                class or contains a div with a style attribute, else False.

    Raises ValueError for a non-black cell with none of those divs.
    """
    for cell in soup.find("table", id="PuzTable").find_all("td"):
        if cell.attrs.get("class") == ["black"]:
            yield {"val": puz.BLACKSQUARE, "markup": None}
            continue

        # Black cells were handled above, so this flag only describes
        # letter-bearing cells.
        has_markup = bool(cell.attrs.get("class") or cell.find("div", style=True))

        content = cell.find("div", class_="letter")
        if content is None:
            content = (cell.find("div", class_="subst2")
                       or cell.find("div", class_="subst"))
            if content is None:
                raise ValueError(f"Couldn't read puzzle tag: {cell}")
        yield {"val": content.text, "markup": has_markup}
def read_solution(soup):
    """Return the solution as one string with a single character per square.

    Only the first character of each square's value is used, so a
    multi-character (rebus) square contributes just its first character.
    """
    first_chars = [square["val"][0] for square in puzzle_squares(soup)]
    return "".join(first_chars)
# Single-byte per-square values that read_extensions() joins into its tables.
EMPTY = b"\x00"  # square has no rebus entry / no markup
MARKUP = b"\x80"  # square whose "markup" flag from puzzle_squares() is true
def read_extensions(soup):
    """Build the rebus and markup extension sections for the puzzle.

    Returns a dict keyed by puz.Extensions members, in insertion order:
      Rebus:          one byte per square; EMPTY for ordinary squares,
                      otherwise the square's rebus table key plus one.
      RebusSolutions: ASCII ``"<key>:<word>;"`` entries, keys counted from 1
                      and right-aligned in two columns.
      Markup:         one byte per square; MARKUP where the square is marked,
                      EMPTY elsewhere.
    The rebus entries are present only when some square holds more than one
    character, and the markup entry only when at least one square is marked.
    """
    markup = []
    rebus_solutions = []
    rebus_locations = []
    for square in puzzle_squares(soup):
        if len(square["val"]) > 1:
            rebus_solutions.append(square["val"])
            # After the append, len() is this entry's 1-based table key; the
            # per-square byte stores key + 1 so that 0 can mean "no rebus".
            key_plus_one = len(rebus_solutions) + 1
            rebus_locations.append(key_plus_one.to_bytes(1, "big"))
        else:
            rebus_locations.append(EMPTY)
        markup.append(MARKUP if square["markup"] else EMPTY)

    extensions = {}
    if rebus_solutions:
        extensions[puz.Extensions.Rebus] = b"".join(rebus_locations)
        table = "".join(
            f"{key:2}:{word};"
            for key, word in enumerate(rebus_solutions, start=1)
        )
        extensions[puz.Extensions.RebusSolutions] = table.encode("ascii")
    # Test for a marked square directly: counting distinct values would drop
    # the section for a grid in which every square is marked.
    if MARKUP in markup:
        extensions[puz.Extensions.Markup] = b"".join(markup)
    return extensions
def read_clues(soup):
    """Return the clue texts ordered by clue number, Across before Down.

    The page is expected to contain exactly two ``<div class="numclue">``
    panels (unpacked as across, then down), whose children alternate between
    a clue number and the clue text.
    """
    def extract_clues(panel):
        # First direct text node of every child, consumed two at a time:
        # the clue number, then the clue itself.
        texts = (child.find(text=True, recursive=False)
                 for child in panel.children)
        numbered = []
        for number_text in texts:
            clue_text = next(texts).rstrip(" :")
            numbered.append((int(number_text), clue_text))
        return numbered

    panels = soup.find_all("div", class_="numclue")
    across, down = map(extract_clues, panels)
    merged = across + down
    # The sort is stable, so for equal numbers the Across clue stays first.
    merged.sort(key=lambda pair: pair[0])
    return [text for _number, text in merged]
def validate_scrape(info):
    """Cross-check the scraped grid and clues against the page's own stats.

    info: the dict built by xword_info(); reads its "Blocks", "Words",
        "height", "width", "solution" and "clues" entries.

    Raises AssertionError when the black-square count, the grid size or the
    clue count disagrees with the stats.  The checks are explicit raises
    rather than ``assert`` statements so they still run under ``python -O``;
    the exception type is unchanged for callers.
    """
    black_squares = info["solution"].count(puz.BLACKSQUARE)
    if info["Blocks"] != black_squares:
        raise AssertionError(
            f"expected {info['Blocks']} black squares, found {black_squares}"
        )
    expected_size = info["height"] * info["width"]
    if len(info["solution"]) != expected_size:
        raise AssertionError(
            f"expected {expected_size} squares, found {len(info['solution'])}"
        )
    if info["Words"] != len(info["clues"]):
        raise AssertionError(
            f"expected {info['Words']} clues, found {len(info['clues'])}"
        )
def get_puzzle(puzzle_date=None):
    """Scrape one puzzle and return it as a populated puz.Puzzle.

    puzzle_date: optional date passed through to get_soup(); None means no
        date is requested.

    Raises whatever the download, parsing or validate_scrape() steps raise.
    """
    puzzle = puz.Puzzle()
    soup = get_soup(puzzle_date)
    info = xword_info(soup)
    validate_scrape(info)

    # Copy across only the scraped values whose names match attributes the
    # Puzzle instance already has.
    known_attributes = vars(puzzle)
    for name, value in info.items():
        if name in known_attributes:
            setattr(puzzle, name, value)

    # Blank out every non-black square.  Masking only A-Z would leave any
    # other solution character (e.g. a digit or symbol) visible in the fill.
    puzzle.fill = "".join(
        square if square == puz.BLACKSQUARE else "-"
        for square in puzzle.solution
    )
    # This relies on dictionaries preserving insertion order
    puzzle._extensions_order = list(puzzle.extensions)
    return puzzle
def main(*argv):
    """Command-line entry point; returns the process exit status.

    argv: the program name followed by either ``filename`` or
        ``iso-date filename``.

    Returns 0 after saving the puzzle, or 2 after printing a usage message
    to stderr when the number of arguments is wrong.
    """
    argc = len(argv)
    if argc not in (2, 3):
        sys.stderr.write(f"usage: {argv[0]} [puzzle-iso-date] file\n")
        sys.stderr.write(f"{argv[0]}: error: the file argument is required\n")
        return 2

    if argc == 3:
        __, datestring, fname_out = argv
        puzzle_date = datetime.date.fromisoformat(datestring)
    else:
        puzzle_date = None
        fname_out = argv[1]

    puzzle = get_puzzle(puzzle_date)
    puzzle.save(fname_out)
    print(f"Saved {puzzle.title} to {fname_out}")
    return 0
def get_unreadable_puzzles(start=datetime.date(1993, 11, 21), delay=1):
    """Try every puzzle from `start` up to (not including) today.

    start: first date to try.
    delay: seconds to sleep after each attempt, successful or not.

    Returns the list of dates for which get_puzzle() raised an exception;
    each such date is also printed as it is found.
    """
    import time

    one_day = datetime.timedelta(1)
    today = datetime.date.today()
    failures = []
    day = start
    while day < today:
        try:
            get_puzzle(day)
        except Exception:
            failures.append(day)
            print(day)
        finally:
            time.sleep(delay)
        day += one_day
    return failures
# When run as a script, pass the raw argv to main() and exit with its return value.
if __name__ == "__main__":
    sys.exit(main(*sys.argv))
@mixographer
Copy link

Does this one work on the pre-Shortz solutions? Thanks!

@harryposner
Copy link
Author

It doesn't, sorry.

@mixographer
Copy link

On September 22, they used en-dashes instead of regular dashes, which broke the script with "UnicodeEncodeError: 'latin-1' codec can't encode character '\u2013'". It seems that error comes from the puz library, not from your code.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment