Skip to content

Instantly share code, notes, and snippets.

@porglezomp
Last active May 14, 2022 20:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save porglezomp/3f98c313bbf87b69c5a99302edb61b03 to your computer and use it in GitHub Desktop.
Save porglezomp/3f98c313bbf87b69c5a99302edb61b03 to your computer and use it in GitHub Desktop.
Tells you which pages of Dracula take place on a given day.
#!/usr/bin/env python3
from __future__ import annotations
import abc
import argparse
import datetime
import enum
import re
import sys
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
# Location of the plain-text copy of Dracula from Project Gutenberg.
# Point this path to wherever your own copy lives.
DRACULA_PATH = Path.home() / "Documents" / "Books" / "dracula.txt"
# Marker substrings delimiting the actual book: dracula_text() keeps only the
# lines strictly between the line containing START and the line containing END.
START = "START OF THE PROJECT GUTENBERG EBOOK DRACULA"
END = "END OF THE PROJECT GUTENBERG EBOOK DRACULA"
# Number of pages the text is mapped onto when character offsets are converted
# to page numbers (see BookRange). The value comes from the author's entry in
# the Reading List app; customize it to match your own edition.
PAGE_COUNT = 416
"""
Some example dates:
_3 May. Bistritz._
_4 May._
_Later: the Morning of 16 May._
"_9 May._
_Hillingham, 24 August._--
_2 August, midnight_.
Some dates are multiline:
_Written 18 July, things so strange happening, that I shall keep
accurate note henceforth till we land._
The ship's log, some of the dates have a different date format:
On 6 July
Some dates are tricky, but maybe don't matter for the calculation:
_Same day, 11 o'clock p. m._
_3-4 October, close to midnight._ (e.g. this gets matched as 4 October)
There are very few instances of "Month Day" instead of "Day Month":
_Telegram, October 24th._
_October 30. Night._
"""
# Pattern that locates every dated heading in the book. It has two top-level
# alternatives:
#   1. an underscore-delimited heading at the start of a line (optionally
#      preceded by whitespace and a double quote), captured either as
#      "day month" in the groups `day`/`month` or as "month day" in the groups
#      `ordmonth`/`ordday`;
#   2. a ship's-log style line beginning with "On ", captured in the groups
#      `onday`/`onmonth`.
# Only one set of day/month groups is populated per match; the others are None.
# re.VERBOSE allows the whitespace and inline comments inside the pattern, and
# re.MULTILINE makes `^` match at the start of every line, not just the text.
DATE = re.compile(
r"""
# All dates seem to start at the beginning of a line, and are surrounded by underscores.
# Some dates have quotes around them since they are part of other text
^\s*"?_
(?P<prefix>[^_]*?) # Extra information before the date, often location, sometimes time
( # Dates are either "day month" (common) or "month day" (uncommon)
(?P<day>\d+)\s # The numeric day
# The month, written in full: The book starts in May, ends in November.
(?P<month>May|June|July|August|September|October|November)
|
(?P<ordmonth>May|June|July|August|September|October|November)\s
(?P<ordday>\d+)(st|nd|rd|th)? # Some of the "month day" dates use ordinal numbers
)
(?P<suffix>[^_]*) # Any extra description (like ", evening")
_ # End of the date
|
# Some ship's log dates start with "On ", parse those separately
^On\s
(?P<onday>\d+)\s # The numeric day
(?P<onmonth>May|June|July|August|September|October|November)
""",
re.VERBOSE | re.MULTILINE,
)
# Module-level cache so the book is read from disk at most once per process.
DRACULA_TEXT: Optional[str] = None


def dracula_text() -> str:
    """Return the body of the book as one string, reading the file on first use.

    Only the lines strictly between the line containing ``START`` and the line
    containing ``END`` are kept. Trailing whitespace is stripped from each line
    and the lines are joined with ``"\\n"``. The result is cached in the
    module-level ``DRACULA_TEXT`` so later calls do not touch the disk.

    Raises:
        OSError: if ``DRACULA_PATH`` cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    global DRACULA_TEXT
    if DRACULA_TEXT is None:
        lines = []
        # Project Gutenberg plain-text files are UTF-8. Decode explicitly
        # rather than relying on the platform's default encoding, which is not
        # UTF-8 everywhere (notably on Windows).
        with open(DRACULA_PATH, encoding="utf-8") as f:
            recording = False
            for line in f:
                if END in line:
                    break
                if recording:
                    lines.append(line.rstrip())
                # Checked after the append so the marker line itself is skipped.
                if START in line:
                    recording = True
        DRACULA_TEXT = "\n".join(lines)
    return DRACULA_TEXT
class OrderedEnum(enum.Enum):
    """Enum base class whose members are ordered by their ``ordinal`` property.

    Subclasses provide ``ordinal``; the four rich comparisons below then compare
    two members of the same class by that value. When the other operand belongs
    to a different class the comparison returns ``NotImplemented``.
    """

    @property
    @abc.abstractmethod
    def ordinal(self) -> int:
        """Integer sort key of this member; subclasses are expected to override it."""
        ...

    def __lt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal < other.ordinal

    def __le__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal <= other.ordinal

    def __gt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal > other.ordinal

    def __ge__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal >= other.ordinal
class Month(OrderedEnum):
    """The twelve calendar months, ordered by their position in the year."""

    January = "January"
    February = "February"
    March = "March"
    April = "April"
    May = "May"
    June = "June"
    July = "July"
    August = "August"
    September = "September"
    October = "October"
    November = "November"
    December = "December"

    @property
    def ordinal(self) -> int:
        """Zero-based index of this month in declaration order."""
        return list(Month).index(self)

    @staticmethod
    def parse(text: str) -> Month:
        """Resolve a full or abbreviated month name to a ``Month``.

        ``text`` is capitalized and then matched as a prefix of the month
        names, so the lookup is case-insensitive and accepts any unambiguous
        abbreviation.

        Raises:
            ValueError: if no month starts with ``text`` or if more than one does.
        """
        wanted = text.capitalize()
        candidates = [m for m in Month if m.value.startswith(wanted)]
        if not candidates:
            raise ValueError(f"No month found for '{text}'")
        if len(candidates) == 1:
            return candidates[0]
        raise ValueError(
            f"Ambiguous month for '{text}': {', '.join(month.name for month in candidates)}"
        )
@dataclass(frozen=True, order=True)
class Date:
    """A month and a day of that month, with no year.

    The field order is significant: the generated ordering compares ``month``
    first and ``day`` second.
    """

    month: Month
    day: int

    def __str__(self) -> str:
        return f"{self.day} {self.month.value}"

    @staticmethod
    def parse_list(date: list[str]) -> Date:
        """Build a ``Date`` from command-line words.

        Accepts either two words or a single word that is split on whitespace.
        The two words may come as "day month" or "month day"; the all-digit
        one is taken as the day, with the first word checked first.

        Raises:
            ValueError: if there are not exactly two words, if neither is
                all digits, or if the month name cannot be resolved.
        """
        words = date[0].split() if len(date) == 1 else date
        if len(words) == 2:
            first, second = words
            if first.isdigit():
                return Date(Month.parse(second), int(first))
            if second.isdigit():
                return Date(Month.parse(first), int(second))
        raise ValueError(f"Unable to parse date: {' '.join(words)}")
@dataclass(frozen=True)
class BookRange:
    """A span of the book text given as start and end character offsets.

    ``text_length`` is the length of the whole text; it is used to scale the
    offsets onto ``PAGE_COUNT`` pages.
    """

    start: int
    end: int
    text_length: int

    def _to_page(self, offset: int) -> int:
        # Proportional position in the text, scaled to pages and floored.
        return offset * PAGE_COUNT // self.text_length

    @property
    def page_start(self) -> int:
        """Page number on which the span begins."""
        return self._to_page(self.start)

    @property
    def page_end(self) -> int:
        """Page number on which the span ends."""
        return self._to_page(self.end)

    @property
    def range(self) -> str:
        """The span as text: a single page number, or "first–last"."""
        first, last = self.page_start, self.page_end
        if first != last:
            return f"{first}–{last}"
        return str(first)

    def merge(self, other: BookRange) -> BookRange:
        """Return the smallest range that covers both ``self`` and ``other``."""
        assert self.text_length == other.text_length
        earliest = min(self.start, other.start)
        latest = max(self.end, other.end)
        return BookRange(earliest, latest, text_length=self.text_length)
def scan_dates(text: str) -> dict[Date, list[BookRange]]:
    """Map every date heading found in ``text`` to the spans of text it covers.

    Each ``DATE`` match opens a span that runs up to the start of the next
    match, or to the end of ``text`` for the final match. A date that occurs
    several times collects one ``BookRange`` per occurrence, in text order.

    Returns:
        A ``defaultdict(list)`` keyed by ``Date``.
    """
    found = list(DATE.finditer(text))
    total = len(text)
    spans = defaultdict(list)
    for index, hit in enumerate(found):
        begin = hit.start(0)
        # The span ends where the next heading begins; the last one runs to the end.
        finish = found[index + 1].start(0) if index + 1 < len(found) else total
        # Only one alternative of the pattern matched, so only one group of each kind is set.
        day_text = hit.group("day") or hit.group("ordday") or hit.group("onday")
        month_text = (
            hit.group("month") or hit.group("ordmonth") or hit.group("onmonth")
        )
        when = Date(day=int(day_text), month=Month.parse(month_text))
        spans[when].append(BookRange(begin, finish, text_length=total))
    return spans
def _merge_ranges(ranges: list[BookRange]) -> list[BookRange]:
    """Coalesce each range that starts on the page where the previous one ends."""
    merged: list[BookRange] = []
    for r in ranges:
        if merged and r.page_start == merged[-1].page_end:
            merged[-1] = merged[-1].merge(r)
        else:
            merged.append(r)
    return merged


def main() -> None:
    """Print which pages of the book cover a given date.

    The date comes from the command line, or defaults to today. If the book has
    no entry on that exact date, the most recent earlier entry is used and,
    when one exists, the next later entry is announced. The pages of every
    entry up to and including the date are then printed as "Pages read so far"
    when they form more than one range.

    Exits with status 1 if the book has no entry on or before the date, and
    with argparse's usage error if the date argument cannot be parsed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "date", nargs="*", help="The date to find in the text. Defaults to today."
    )
    args = parser.parse_args()
    if args.date:
        try:
            date = Date.parse_list(args.date)
        except ValueError as exc:
            # Report bad input as a usage error instead of a traceback.
            parser.error(str(exc))
    else:
        today = datetime.date.today()
        # Month members are declared in calendar order, so index by month number
        # rather than going through the locale-dependent strftime("%B") name.
        date = Date(month=list(Month)[today.month - 1], day=today.day)

    dates = scan_dates(dracula_text())
    previous_dates = [(k, v) for k, v in dates.items() if k <= date]
    if not previous_dates:
        print(f"No entries found before {date}.")
        sys.exit(1)

    found_date, matches = max(previous_dates, key=lambda p: p[0])
    if found_date != date:
        print(f"No entry found on {date}, using the most recent {found_date}.")
        # There may be no later entry at all (e.g. a date after the book's
        # final entry); min() on an empty iterable would raise ValueError.
        next_date = min((k for k in dates if k > date), default=None)
        if next_date is not None:
            print(f"Next entry is on {next_date}.")
        date = found_date

    ranges = ", ".join(r.range for r in _merge_ranges(matches))
    print(f"{date} includes these pages: {ranges}")

    all_matches = sorted(
        (m for _, date_matches in previous_dates for m in date_matches),
        key=lambda m: (m.page_start, m.page_end),
    )
    all_ranges = _merge_ranges(all_matches)
    if len(all_ranges) > 1:
        ranges = ", ".join(r.range for r in all_ranges)
        print(f"Pages read so far: {ranges}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment