Last active
May 14, 2022 20:28
-
-
Save porglezomp/3f98c313bbf87b69c5a99302edb61b03 to your computer and use it in GitHub Desktop.
Tells you which pages of Dracula take place on a given day.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import abc | |
import argparse | |
import datetime | |
import enum | |
import re | |
import sys | |
from collections import defaultdict | |
from dataclasses import dataclass | |
from pathlib import Path | |
from typing import Optional | |
# Point this path to your copy of Dracula from Project Gutenberg.
DRACULA_PATH = Path.home() / "Documents" / "Books" / "dracula.txt"
# Delimiters for the portion that's the actual book: only the lines between the
# line containing START and the line containing END are read.
START = "START OF THE PROJECT GUTENBERG EBOOK DRACULA"
END = "END OF THE PROJECT GUTENBERG EBOOK DRACULA"
# Number of pages that text offsets are scaled onto when reporting page numbers.
PAGE_COUNT = 416  # From my entry in the Reading List app, you can customize it yourself
""" | |
Some example dates: | |
_3 May. Bistritz._ | |
_4 May._ | |
_Later: the Morning of 16 May._ | |
"_9 May._ | |
_Hillingham, 24 August._-- | |
_2 August, midnight_. | |
Some dates are multiline: | |
_Written 18 July, things so strange happening, that I shall keep | |
accurate note henceforth till we land._ | |
The ship's log, some of the dates have a different date format: | |
On 6 July | |
Some dates are tricky, but maybe don't matter for the calculation: | |
_Same day, 11 o'clock p. m._ | |
_3-4 October, close to midnight._ (e.g. this gets matched as 4 October) | |
There are very few instances of "Month Day" instead of "Day Month":
_Telegram, October 24th._ | |
_October 30. Night._ | |
""" | |
# Pattern that locates the date headings in the text.
#
# Each supported spelling fills its own pair of named groups, and exactly one
# pair is set on any given match:
#   * day / month        -- "3 May" inside underscores
#   * ordmonth / ordday  -- "October 24th" inside underscores
#   * onday / onmonth    -- "On 6 July" at the start of a line
#
# re.VERBOSE lets the pattern carry whitespace and "#" comments; re.MULTILINE
# makes "^" match at the start of every line. Note that "[^_]" also matches
# newlines, so the prefix and suffix may run across several lines.
DATE = re.compile(
    r"""
    # All dates seem to start at the beginning of a line, and are surrounded by underscores.
    # Some dates have quotes around them since they are part of other text
    ^\s*"?_
    (?P<prefix>[^_]*?) # Extra information before the date, often location, sometimes time
    ( # Dates are either "day month" (common) or "month day" (uncommon)
        (?P<day>\d+)\s # The numeric day
        # The month, written in full: The book starts in May, ends in November.
        (?P<month>May|June|July|August|September|October|November)
    |
        (?P<ordmonth>May|June|July|August|September|October|November)\s
        (?P<ordday>\d+)(st|nd|rd|th)? # Some of the "month day" dates use ordinal numbers
    )
    (?P<suffix>[^_]*) # Any extra description (like ", evening")
    _ # End of the date
    |
    # Some ship's log dates start with "On ", parse those separately
    ^On\s
    (?P<onday>\d+)\s # The numeric day
    (?P<onmonth>May|June|July|August|September|October|November)
    """,
    re.VERBOSE | re.MULTILINE,
)
# To cache reading from the file
DRACULA_TEXT: Optional[str] = None


def dracula_text() -> str:
    """Return the body of the book, reading and caching it on first use.

    Only the lines after the first line containing ``START`` and before the
    next line containing ``END`` are kept. Trailing whitespace is stripped from
    every kept line and the lines are joined back together with newlines.

    Returns:
        The book text as a single string. It is empty if ``START`` never
        appears in the file.

    Raises:
        OSError: If the file at ``DRACULA_PATH`` cannot be opened.
    """
    global DRACULA_TEXT
    if DRACULA_TEXT is None:
        lines = []
        # Decode explicitly instead of relying on the platform's default
        # encoding (Project Gutenberg's current plain-text files are UTF-8).
        # Undecodable bytes are replaced rather than aborting the whole read,
        # since the text is only used for date matching and page estimates.
        with open(DRACULA_PATH, encoding="utf-8", errors="replace") as f:
            recording = False
            for line in f:
                # Check END first so the closing marker line is never recorded.
                if END in line:
                    break
                if recording:
                    lines.append(line.rstrip())
                # Checked after the append so the START marker line itself is
                # skipped; recording begins with the following line.
                if START in line:
                    recording = True
        DRACULA_TEXT = "\n".join(lines)
    return DRACULA_TEXT
class OrderedEnum(enum.Enum):
    """Enum base class whose members are ordered by their ``ordinal`` property.

    Subclasses supply ``ordinal``. The four rich comparisons only compare
    members of the very same class; for any other operand they return
    ``NotImplemented`` so Python can fall back to its default handling.
    """

    @property
    @abc.abstractmethod
    def ordinal(self) -> int:
        """Integer sort key of this member; smaller values sort first."""
        ...

    def __lt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal < other.ordinal

    def __le__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal <= other.ordinal

    def __gt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal > other.ordinal

    def __ge__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal >= other.ordinal
class Month(OrderedEnum):
    """The twelve months, declared (and therefore ordered) in calendar order."""

    January = "January"
    February = "February"
    March = "March"
    April = "April"
    May = "May"
    June = "June"
    July = "July"
    August = "August"
    September = "September"
    October = "October"
    November = "November"
    December = "December"

    @staticmethod
    def parse(text: str) -> Month:
        """Resolve a full month name or an unambiguous prefix of one.

        Matching ignores case: the text is capitalized and then compared
        against the start of each month name.

        Raises:
            ValueError: If no month, or more than one month, starts with
                ``text``.
        """
        wanted = text.capitalize()
        candidates = [member for member in Month if member.value.startswith(wanted)]
        if not candidates:
            raise ValueError(f"No month found for '{text}'")
        if len(candidates) > 1:
            names = ", ".join(month.name for month in candidates)
            raise ValueError(f"Ambiguous month for '{text}': {names}")
        return candidates[0]

    @property
    def ordinal(self) -> int:
        """Zero-based position of this month in declaration order."""
        return tuple(Month).index(self)
@dataclass(frozen=True, order=True) | |
class Date: | |
month: Month | |
day: int | |
def __str__(self) -> str: | |
return f"{self.day} {self.month.value}" | |
@staticmethod | |
def parse_list(date: list[str]) -> Date: | |
if len(date) == 1: | |
date = date[0].split() | |
match date: | |
case [day, month] if day.isdigit(): | |
pass | |
case [month, day] if day.isdigit(): | |
pass | |
case _: | |
raise ValueError(f"Unable to parse date: {' '.join(date)}") | |
return Date(Month.parse(month), int(day)) | |
@dataclass(frozen=True)
class BookRange:
    """A span of the text, given as offsets, that can be reported in pages."""

    start: int  # offset where the span begins
    end: int  # offset where the span ends
    text_length: int  # length of the whole text the offsets refer to

    def _page_at(self, offset: int) -> int:
        # Scale an offset onto PAGE_COUNT pages, rounding down.
        return offset * PAGE_COUNT // self.text_length

    @property
    def page_start(self) -> int:
        """Page on which the span begins."""
        return self._page_at(self.start)

    @property
    def page_end(self) -> int:
        """Page on which the span ends."""
        return self._page_at(self.end)

    @property
    def range(self) -> str:
        """The span's pages as text: one number, or "first–last"."""
        first, last = self.page_start, self.page_end
        return str(first) if first == last else f"{first}–{last}"

    def merge(self, other: BookRange) -> BookRange:
        """Return the smallest span covering both ``self`` and ``other``.

        Both spans must refer to a text of the same length.
        """
        assert self.text_length == other.text_length
        lowest = min(self.start, other.start)
        highest = max(self.end, other.end)
        return BookRange(start=lowest, end=highest, text_length=self.text_length)
def scan_dates(text: str) -> dict[Date, list[BookRange]]:
    """Group the text into spans keyed by the date heading that opens them.

    Every ``DATE`` match starts a span that runs up to the start of the next
    match; the last span runs to the end of the text. Anything before the
    first match belongs to no span. A date that appears several times collects
    one span per appearance, in text order.
    """
    total = len(text)
    headings = list(DATE.finditer(text))
    offsets = [heading.start(0) for heading in headings]
    by_date = defaultdict(list)
    # Pair each heading with its own start and the following heading's start.
    for heading, begin, finish in zip(headings, offsets, offsets[1:] + [total]):
        # Only one of the three group pairs is populated for a given match.
        day = heading.group("day") or heading.group("ordday") or heading.group("onday")
        month = (
            heading.group("month")
            or heading.group("ordmonth")
            or heading.group("onmonth")
        )
        entry = Date(day=int(day), month=Month.parse(month))
        by_date[entry].append(BookRange(begin, finish, text_length=total))
    return by_date
def main() -> None:
    """Command-line entry point: print the pages that cover a given date.

    The date comes from the positional arguments (for example ``3 May`` or
    ``May 3``) and defaults to today. If the book has no entry on that date,
    the most recent earlier entry is used instead and the next entry, when one
    exists, is announced. Exits with status 1 when no entry falls on or before
    the date, and with an argparse usage error when the date cannot be parsed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "date", nargs="*", help="The date to find in the text. Defaults to today."
    )
    args = parser.parse_args()
    if args.date:
        try:
            date = Date.parse_list(args.date)
        except ValueError as err:
            # Report bad input as a usage error rather than a traceback.
            parser.error(str(err))
    else:
        today = datetime.date.today()
        # Month lists all twelve months in calendar order, so index it directly
        # instead of formatting with the locale-dependent "%B" and re-parsing.
        date = Date(month=list(Month)[today.month - 1], day=today.day)

    dates = scan_dates(dracula_text())
    previous_dates = [(k, v) for k, v in dates.items() if k <= date]
    if not previous_dates:
        print(f"No entries found before {date}.")
        sys.exit(1)

    found_date, matches = max(previous_dates, key=lambda p: p[0])
    if found_date != date:
        print(f"No entry found on {date}, using the most recent {found_date}.")
        # There may be no later entry at all (e.g. a date after the book's
        # final entry); min() on an empty sequence would raise ValueError.
        next_date = min((k for k in dates if k > date), default=None)
        if next_date is not None:
            print(f"Next entry is on {next_date}.")
        date = found_date

    def merge_ranges(ranges: list[BookRange]) -> list[BookRange]:
        """Fuse each range into its predecessor when it starts on the page
        where the predecessor ends; otherwise keep it as a separate range."""
        merged: list[BookRange] = []
        for r in ranges:
            if merged and r.page_start == merged[-1].page_end:
                merged[-1] = merged[-1].merge(r)
            else:
                merged.append(r)
        return merged

    ranges = ", ".join(r.range for r in merge_ranges(matches))
    print(f"{date} includes these pages: {ranges}")

    # Everything dated on or before the requested date, in page order.
    all_matches = sorted(
        (m for _, matches in previous_dates for m in matches),
        key=lambda m: (m.page_start, m.page_end),
    )
    all_ranges = merge_ranges(all_matches)
    if len(all_ranges) > 1:
        ranges = ", ".join(r.range for r in all_ranges)
        print(f"Pages read so far: {ranges}")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment