porglezomp/dracula.py

## dracula.py
#!/usr/bin/env python3

from __future__ import annotations

import abc
import argparse
import datetime
import enum
import re
import sys
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

# Point this path to your copy of dracula from Project Gutenberg
DRACULA_PATH = Path.home() / "Documents" / "Books" / "dracula.txt"
# Delimiters for the portion that's the actual book
START = "START OF THE PROJECT GUTENBERG EBOOK DRACULA"
END = "END OF THE PROJECT GUTENBERG EBOOK DRACULA"

PAGE_COUNT = 416  # From my entry in the Reading List app, you can customize it yourself

"""
Some example dates:

_3 May. Bistritz._
_4 May._
_Later: the Morning of 16 May._
"_9 May._
_Hillingham, 24 August._--
_2 August, midnight_.

Some dates are multiline:

_Written 18 July, things so strange happening, that I shall keep
accurate note henceforth till we land._

The ship's log, some of the dates have a different date format:

On 6 July

Some dates are tricky, but maybe don't matter for the calculation:

_Same day, 11 o'clock p. m._
_3-4 October, close to midnight._  (e.g. this gets matched as 4 October)

There are very few instances of "Month Day" instead of "Day Month":

_Telegram, October 24th._
_October 30. Night._
"""
DATE = re.compile(
    r"""
# All dates seem to start at the beginning of a line, and are surrounded by underscores.
# Some dates have quotes around them since they are part of other text
^\s*"?_
(?P<prefix>[^_]*?)    # Extra information before the date, often location, sometimes time
( # Dates are either "day month" (common) or "month day" (uncommon)
(?P<day>\d+)\s        # The numeric day
# The month, written in full: The book starts in May, ends in November.
(?P<month>May|June|July|August|September|October|November)
|
(?P<ordmonth>May|June|July|August|September|October|November)\s
(?P<ordday>\d+)(st|nd|rd|th)?  # Some of the "month day" dates use ordinal numbers
)
(?P<suffix>[^_]*)    # Any extra description (like ", evening")
_                    # End of the date
|
# Some ship's log dates start with "On ", parse those separately
^On\s
(?P<onday>\d+)\s        # The numeric day
(?P<onmonth>May|June|July|August|September|October|November)
""",
    re.VERBOSE | re.MULTILINE,
)


# To cache reading from the file
DRACULA_TEXT: Optional[str] = None


def dracula_text() -> str:
    """Return the body of the book, reading it from disk on first use.

    Only the lines strictly between the line containing ``START`` and the
    line containing ``END`` are kept; the marker lines themselves are
    dropped. Trailing whitespace is stripped from every line and the lines
    are joined with newlines. The result is cached in the module-level
    ``DRACULA_TEXT`` so the file is read at most once per process.

    Raises:
        OSError: if ``DRACULA_PATH`` cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    global DRACULA_TEXT
    if DRACULA_TEXT is None:
        lines = []
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # default, so the same file reads identically on every system.
        # NOTE(review): assumes the "Plain Text UTF-8" Gutenberg download.
        with open(DRACULA_PATH, encoding="utf-8") as f:
            recording = False
            for line in f:
                if END in line:
                    break
                if recording:
                    lines.append(line.rstrip())
                # Checked after the append so the START marker line itself
                # is never recorded.
                if START in line:
                    recording = True
        DRACULA_TEXT = "\n".join(lines)
    return DRACULA_TEXT


class OrderedEnum(enum.Enum):
    """Enum base class whose members are ordered by their ``ordinal``.

    Subclasses provide ``ordinal``. Comparing against anything that is not
    a member of exactly the same class returns ``NotImplemented``, leaving
    the decision to Python's normal comparison fallback.
    """

    @property
    @abc.abstractmethod
    def ordinal(self) -> int:
        """Integer position used when ordering members."""

    def __lt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal < other.ordinal

    def __le__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal <= other.ordinal

    def __gt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal > other.ordinal

    def __ge__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.ordinal >= other.ordinal


class Month(OrderedEnum):
    """The twelve months, ordered by their position in this declaration."""

    January = "January"
    February = "February"
    March = "March"
    April = "April"
    May = "May"
    June = "June"
    July = "July"
    August = "August"
    September = "September"
    October = "October"
    November = "November"
    December = "December"

    @property
    def ordinal(self) -> int:
        """Zero-based index of this month in declaration order."""
        return list(Month).index(self)

    @staticmethod
    def parse(text: str) -> Month:
        """Resolve a month name or an unambiguous prefix of one.

        The input is normalised with ``str.capitalize`` before it is
        compared against the start of each month's name.

        Raises:
            ValueError: if no month starts with the text, or more than one
                does.
        """
        prefix = text.capitalize()
        candidates = [m for m in Month if m.value.startswith(prefix)]
        if not candidates:
            raise ValueError(f"No month found for '{text}'")
        if len(candidates) == 1:
            return candidates[0]
        names = ", ".join(m.name for m in candidates)
        raise ValueError(f"Ambiguous month for '{text}': {names}")


@dataclass(frozen=True, order=True)
class Date:
    month: Month
    day: int

    def __str__(self) -> str:
        return f"{self.day} {self.month.value}"

    @staticmethod
    def parse_list(date: list[str]) -> Date:
        if len(date) == 1:
            date = date[0].split()
        match date:
            case [day, month] if day.isdigit():
                pass
            case [month, day] if day.isdigit():
                pass
            case _:
                raise ValueError(f"Unable to parse date: {' '.join(date)}")
        return Date(Month.parse(month), int(day))


@dataclass(frozen=True)
class BookRange:
    """A span of character offsets in the book text, viewable as pages.

    ``start`` and ``end`` are offsets into a text of ``text_length``
    characters; page numbers are derived by scaling an offset by
    ``PAGE_COUNT / text_length`` with floor division.
    """

    start: int
    end: int
    text_length: int

    def _page_at(self, offset: int) -> int:
        # Multiply before dividing so the floor is taken once, at the end.
        return offset * PAGE_COUNT // self.text_length

    @property
    def page_start(self) -> int:
        """Page on which the span begins."""
        return self._page_at(self.start)

    @property
    def page_end(self) -> int:
        """Page on which the span ends."""
        return self._page_at(self.end)

    @property
    def range(self) -> str:
        """A single page number, or "first–last" when the span crosses pages."""
        first, last = self.page_start, self.page_end
        return str(first) if first == last else f"{first}–{last}"

    def merge(self, other: BookRange) -> BookRange:
        """Return the smallest span covering both ``self`` and ``other``.

        Both spans must refer to a text of the same length.
        """
        assert self.text_length == other.text_length
        return BookRange(
            start=min(self.start, other.start),
            end=max(self.end, other.end),
            text_length=self.text_length,
        )


def scan_dates(text: str) -> dict[Date, list[BookRange]]:
    """Locate every dated heading in ``text`` and group its spans by date.

    Each heading matched by ``DATE`` opens a span that runs from the start
    of that match to the start of the next match, or to the end of the
    text for the last heading. A date that appears more than once collects
    one ``BookRange`` per appearance, in text order.
    """
    total = len(text)
    headings = list(DATE.finditer(text))

    # Boundaries: the start of every heading, plus the end of the text as a
    # sentinel so the final heading has somewhere to stop.
    boundaries = [heading.start(0) for heading in headings]
    boundaries.append(total)

    spans = defaultdict(list)
    for index, heading in enumerate(headings):
        # Exactly one of the three alternatives in the pattern matched, so
        # take whichever day/month group pair is populated.
        day_text = (
            heading.group("day") or heading.group("ordday") or heading.group("onday")
        )
        month_text = (
            heading.group("month")
            or heading.group("ordmonth")
            or heading.group("onmonth")
        )
        entry = Date(day=int(day_text), month=Month.parse(month_text))
        spans[entry].append(
            BookRange(boundaries[index], boundaries[index + 1], text_length=total)
        )
    return spans


def main() -> None:
    """Command-line entry point: report which pages cover a given date.

    The optional positional ``date`` argument is parsed with
    ``Date.parse_list``; without it, today's day and month are used. The
    program prints the page ranges for the most recent dated entry on or
    before that date and, when they form more than one merged range, all
    pages covered by entries up to that date.

    Exits with status 1 when the book has no entry on or before the date.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "date", nargs="*", help="The date to find in the text. Defaults to today."
    )

    args = parser.parse_args()
    if args.date:
        date = Date.parse_list(args.date)
    else:
        today = datetime.date.today()
        date = Date(month=Month.parse(today.strftime("%B")), day=today.day)
    dates = scan_dates(dracula_text())

    previous_dates = [(k, v) for k, v in dates.items() if k <= date]
    if not previous_dates:
        print(f"No entries found before {date}.")
        sys.exit(1)

    found_date, matches = max(previous_dates, key=lambda p: p[0])
    if found_date != date:
        print(f"No entry found on {date}, using the most recent {found_date}.")
        # The requested date may lie after the last entry in the book, in
        # which case there is no "next" entry; a bare min() over the empty
        # generator would raise ValueError.
        next_date = min((k for k in dates if k > date), default=None)
        if next_date is not None:
            print(f"Next entry is on {next_date}.")
        date = found_date

    def merge_ranges(ranges: list[BookRange]) -> list[BookRange]:
        """Fold each range into its predecessor when it starts on the page
        where the predecessor ends; otherwise start a new range."""
        merged = []
        for r in ranges:
            if merged and r.page_start == merged[-1].page_end:
                merged[-1] = merged[-1].merge(r)
            else:
                merged.append(r)
        return merged

    ranges = ", ".join(r.range for r in merge_ranges(matches))
    print(f"{date} includes these pages: {ranges}")

    # Everything up to and including the chosen date, ordered by page so
    # that ranges touching on a page can be folded together.
    all_matches = sorted(
        (m for _, matches in previous_dates for m in matches),
        key=lambda m: (m.page_start, m.page_end),
    )
    all_ranges = merge_ranges(all_matches)
    if len(all_ranges) > 1:
        ranges = ", ".join(r.range for r in all_ranges)
        print(f"Pages read so far: {ranges}")


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	from __future__ import annotations

	import abc
	import argparse
	import datetime
	import enum
	import re
	import sys
	from collections import defaultdict
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Optional

	# Point this path to your copy of dracula from Project Gutenberg
	DRACULA_PATH = Path.home() / "Documents" / "Books" / "dracula.txt"
	# Delimiters for the portion that's the actual book
	START = "START OF THE PROJECT GUTENBERG EBOOK DRACULA"
	END = "END OF THE PROJECT GUTENBERG EBOOK DRACULA"

	PAGE_COUNT = 416 # From my entry in the Reading List app, you can customize it yourself

	"""
	Some example dates:

	_3 May. Bistritz._
	_4 May._
	_Later: the Morning of 16 May._
	"_9 May._
	_Hillingham, 24 August._--
	_2 August, midnight_.

	Some dates are multiline:

	_Written 18 July, things so strange happening, that I shall keep
	accurate note henceforth till we land._

	The ship's log, some of the dates have a different date format:

	On 6 July

	Some dates are tricky, but maybe don't matter for the calculation:

	_Same day, 11 o'clock p. m._
	_3-4 October, close to midnight._ (e.g. this gets matched as 4 October)

	There are very few instances of "Month Day" instead of of "Day Month":

	_Telegram, October 24th._
	_October 30. Night._
	"""
	DATE = re.compile(
	r"""
	# All dates seem to start at the beginning of a line, and are surrounded by underscores.
	# Some dates have quotes around them since they are part of other text
	^\s*"?_
	(?P<prefix>[^_]*?) # Extra information before the date, often location, sometimes time
	( # Dates are either "day month" (common) or "month day" (uncommon)
	(?P<day>\d+)\s # The numeric day
	# The month, written in full: The book starts in May, ends in November.
	(?P<month>May\|June\|July\|August\|September\|October\|November)
	\|
	(?P<ordmonth>May\|June\|July\|August\|September\|October\|November)\s
	(?P<ordday>\d+)(st\|nd\|rd\|th)? # Some of the "month day" dates use ordinal numbers
	)
	(?P<suffix>[^_]*) # Any extra description (like ", evening")
	_ # End of the date
	\|
	# Some ship's log dates start with "On ", parse those separately
	^On\s
	(?P<onday>\d+)\s # The numeric day
	(?P<onmonth>May\|June\|July\|August\|September\|October\|November)
	""",
	re.VERBOSE \| re.MULTILINE,
	)


	# To cache reading from the file
	DRACULA_TEXT: Optional[str] = None


	def dracula_text() -> str:
	global DRACULA_TEXT
	if DRACULA_TEXT is None:
	lines = []
	with open(DRACULA_PATH) as f:
	recording = False
	for line in f:
	if END in line:
	break
	if recording:
	lines.append(line.rstrip())
	if START in line:
	recording = True
	DRACULA_TEXT = "\n".join(lines)
	return DRACULA_TEXT


	class OrderedEnum(enum.Enum):
	def __ge__(self, other):
	if self.__class__ is other.__class__:
	return self.ordinal >= other.ordinal
	return NotImplemented

	def __gt__(self, other):
	if self.__class__ is other.__class__:
	return self.ordinal > other.ordinal
	return NotImplemented

	def __le__(self, other):
	if self.__class__ is other.__class__:
	return self.ordinal <= other.ordinal
	return NotImplemented

	def __lt__(self, other):
	if self.__class__ is other.__class__:
	return self.ordinal < other.ordinal
	return NotImplemented

	@property
	@abc.abstractmethod
	def ordinal(self) -> int:
	...


	class Month(OrderedEnum):
	January = "January"
	February = "February"
	March = "March"
	April = "April"
	May = "May"
	June = "June"
	July = "July"
	August = "August"
	September = "September"
	October = "October"
	November = "November"
	December = "December"

	@staticmethod
	def parse(text: str) -> Month:
	cap_text = text.capitalize()
	months = [month for month in Month if month.value.startswith(cap_text)]
	match months:
	case []:
	raise ValueError(f"No month found for '{text}'")
	case [month]:
	return month
	case [*months]:
	raise ValueError(
	f"Ambiguous month for '{text}': {', '.join(month.name for month in months)}"
	)

	@property
	def ordinal(self) -> int:
	return list(Month).index(self)


	@dataclass(frozen=True, order=True)
	class Date:
	month: Month
	day: int

	def __str__(self) -> str:
	return f"{self.day} {self.month.value}"

	@staticmethod
	def parse_list(date: list[str]) -> Date:
	if len(date) == 1:
	date = date[0].split()
	match date:
	case [day, month] if day.isdigit():
	pass
	case [month, day] if day.isdigit():
	pass
	case _:
	raise ValueError(f"Unable to parse date: {' '.join(date)}")
	return Date(Month.parse(month), int(day))


	@dataclass(frozen=True)
	class BookRange:
	start: int
	end: int
	text_length: int

	@property
	def page_start(self) -> int:
	return self.start * PAGE_COUNT // self.text_length

	@property
	def page_end(self) -> int:
	return self.end * PAGE_COUNT // self.text_length

	@property
	def range(self) -> str:
	if self.page_start == self.page_end:
	return str(self.page_start)
	return f"{self.page_start}–{self.page_end}"

	def merge(self, other: BookRange) -> BookRange:
	assert self.text_length == other.text_length
	return BookRange(
	min(self.start, other.start),
	max(self.end, other.end),
	text_length=self.text_length,
	)


	def scan_dates(text: str) -> dict[Date, list[BookRange]]:
	# Find all the matching dates in the book
	matches = list(DATE.finditer(text))
	# We want the start of each match, to represent the start of that date.
	starts = [m.start(0) for m in matches]
	# The ends of each date are the start of the next date.
	# The final range ends with the end of the text
	ends = starts[1:] + [len(text)]
	# And the actual dates are parsed out:
	dates = [
	Date(
	day=int(m.group("day") or m.group("ordday") or m.group("onday")),
	month=Month.parse(
	m.group("month") or m.group("ordmonth") or m.group("onmonth")
	),
	)
	for m in matches
	]

	# We're constructing a dictionary of spans for each date:
	ranges = defaultdict(list)
	for start, end, date in zip(starts, ends, dates):
	ranges[date].append(BookRange(start, end, text_length=len(text)))
	return ranges


	def main() -> None:
	parser = argparse.ArgumentParser()
	parser.add_argument(
	"date", nargs="*", help="The date to find in the text. Defaults to today."
	)

	args = parser.parse_args()
	if args.date:
	date = Date.parse_list(args.date)
	else:
	date = datetime.date.today()
	date = Date(month=Month.parse(date.strftime("%B")), day=date.day)
	dates = scan_dates(dracula_text())

	previous_dates = [(k, v) for k, v in dates.items() if k <= date]
	if not previous_dates:
	print(f"No entries found before {date}.")
	sys.exit(1)

	found_date, matches = max(previous_dates, key=lambda p: p[0])
	if found_date != date:
	print(f"No entry found on {date}, using the most recent {found_date}.")
	next_date = min(k for k in dates.keys() if k > date)
	print(f"Next entry is on {next_date}.")
	date = found_date

	def merge_ranges(ranges: list[BookRange]) -> list[BookRange]:
	merged = []
	for r in ranges:
	if not merged:
	merged.append(r)
	continue
	if r.page_start == merged[-1].page_end:
	merged[-1] = merged[-1].merge(r)
	else:
	merged.append(r)
	return merged

	ranges = ", ".join(r.range for r in merge_ranges(matches))
	print(f"{date} includes these pages: {ranges}")

	all_matches = sorted(
	(m for _, matches in previous_dates for m in matches),
	key=lambda m: (m.page_start, m.page_end),
	)
	all_ranges = merge_ranges(all_matches)
	if len(all_ranges) > 1:
	ranges = ", ".join(r.range for r in all_ranges)
	print(f"Pages read so far: {ranges}")


	if __name__ == "__main__":
	main()