maldevide/epub-process.ipynb

## epub-process.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup, NavigableString, Tag\n",
    "import ebooklib\n",
    "from ebooklib import epub\n",
    "import os\n",
    "import re\n",
    "from typing import Generator, List\n",
    "\n",
    "def parse_ebook_html(ebook_path: str, try_chapter : bool = False) -> Generator[tuple, None, None]:\n",
    "    \"\"\"\n",
    "    Walks every document item of an EPUB file and yields one record per non-empty paragraph.\n",
    "\n",
    "    Each document is tokenized with html_tokenizer(). A document that produces no paragraphs\n",
    "    is ignored and does not consume a chapter number.\n",
    "\n",
    "    Parameters:\n",
    "    - ebook_path (str): The path to the EPUB file.\n",
    "    - try_chapter (bool): Forwarded unchanged to html_tokenizer().\n",
    "\n",
    "    Yields:\n",
    "    - tuple: (book_name, paragraph_ix, chapter_ix, chapter_title, text, char_count, cumsum_char_count)\n",
    "      where book_name is the file name without its extension, paragraph_ix and chapter_ix are\n",
    "      counters starting at 1, char_count is len(text) and cumsum_char_count is its running total\n",
    "      over the whole book.\n",
    "    \"\"\"\n",
    "    book_name = os.path.splitext(os.path.basename(ebook_path))[0]\n",
    "    chapter_no = 0\n",
    "    paragraph_no = 0\n",
    "    running_chars = 0\n",
    "    for document in epub.read_epub(ebook_path).get_items_of_type(ebooklib.ITEM_DOCUMENT):\n",
    "        html = document.get_content().decode('utf-8')\n",
    "        paragraphs = list(html_tokenizer(html, try_chapter))\n",
    "        if not paragraphs:\n",
    "            continue\n",
    "        # Only documents that contributed text count as a chapter.\n",
    "        chapter_no += 1\n",
    "        for chapter_title, text in paragraphs:\n",
    "            if not text:\n",
    "                continue\n",
    "            paragraph_no += 1\n",
    "            char_count = len(text)\n",
    "            running_chars += char_count\n",
    "            yield (book_name, paragraph_no, chapter_no, chapter_title, text, char_count, running_chars)\n",
    "\n",
    "def html_tokenizer(html_content: str, try_chapter) -> Generator[tuple, None, None]:\n",
    "    \"\"\"\n",
    "    Generator that turns one HTML document into cleaned paragraph strings, one per <p> tag.\n",
    "\n",
    "    Text is gathered recursively from each <p>. <a class='calibre3'> elements are left out,\n",
    "    and <span class='italic'> elements contribute their complete text in one piece.\n",
    "    The gathered text is then normalized: newlines and non-breaking spaces become spaces,\n",
    "    curly quotes and guillemets become '\\\"', the ellipsis character becomes '...', bar pairs\n",
    "    are removed and whitespace runs are collapsed.\n",
    "\n",
    "    A paragraph is dropped when its raw text contains '| |', when the cleaned text contains\n",
    "    'Oceano', when it is empty, or when it is a numeric string shorter than four characters.\n",
    "\n",
    "    Parameters:\n",
    "    - html_content (str): The HTML content to be tokenized.\n",
    "    - try_chapter (bool): If True, the first <p> of the document (unless it has class\n",
    "      'calibre14') is consumed as a heading instead of being yielded: when it holds at least\n",
    "      two <span class='bold'> elements their texts, joined by a space, become the chapter title.\n",
    "\n",
    "    Yields:\n",
    "    - tuple: (chapter, text), where chapter is the title found as described above, or None.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(html_content, 'html.parser')\n",
    "    fix_quote = re.compile(r'“|”|»|«')\n",
    "    fix_threedot = re.compile(r'…')\n",
    "    fix_bars = re.compile(r'\\|\\s*\\|')\n",
    "    fix_spaces = re.compile(r'\\s+')\n",
    "\n",
    "    def collect_text(node, pieces: List[str]):\n",
    "        # Append the text found under `node` to `pieces`, depth first.\n",
    "        if isinstance(node, NavigableString):\n",
    "            pieces.append(str(node))\n",
    "            return\n",
    "        if not isinstance(node, Tag):\n",
    "            return\n",
    "        classes = node.get('class', [])\n",
    "        if node.name == 'a' and 'calibre3' in classes:\n",
    "            # The anchor and everything inside it is ignored; its siblings are still visited.\n",
    "            return\n",
    "        if node.name == 'span' and 'italic' in classes:\n",
    "            # Italic spans are taken whole rather than walked child by child.\n",
    "            pieces.append(node.get_text())\n",
    "            return\n",
    "        for child in node.children:\n",
    "            collect_text(child, pieces)\n",
    "\n",
    "    chapter = None\n",
    "    for position, p_tag in enumerate(soup.find_all('p')):\n",
    "        # Paragraphs with class 'calibre14' look like metadata, but skipping them is\n",
    "        # currently disabled: they only bypass the chapter-heading handling below.\n",
    "        is_metadata = 'calibre14' in p_tag.get('class', [])\n",
    "        if position == 0 and try_chapter and not is_metadata:\n",
    "            # Instead of processing, this contains our chapter and title\n",
    "            bold_parts = [span.get_text() for span in p_tag.find_all('span', class_='bold')]\n",
    "            if len(bold_parts) >= 2:\n",
    "                chapter = ' '.join(bold_parts)\n",
    "            continue\n",
    "\n",
    "        pieces = []\n",
    "        collect_text(p_tag, pieces)\n",
    "        # if our text is '| |', skip it\n",
    "        if '| |' in ' '.join(pieces):\n",
    "            continue\n",
    "\n",
    "        stripped_pieces = (piece.strip() for piece in pieces)\n",
    "        text = ' '.join(piece for piece in stripped_pieces if piece)\n",
    "        text = text.replace('\\n', ' ').replace('\\xa0', ' ')\n",
    "        text = fix_quote.sub('\"', text)\n",
    "        text = fix_threedot.sub('...', text)\n",
    "        text = fix_bars.sub('', text)\n",
    "        text = fix_spaces.sub(' ', text).strip()\n",
    "\n",
    "        if 'Oceano' in text or not text:\n",
    "            continue\n",
    "        if len(text) < 4 and text.isnumeric():\n",
    "            continue\n",
    "        # Disabled: if the first character is a capital letter, then a space, followed by more\n",
    "        # capital letters, it is likely the beginning of a chapter and needs the space removed.\n",
    "        #elif len(text) > 2 and text[0].isupper() and text[1] == ' ' and text[2].isupper():\n",
    "        #    text = text[0] + text[2:]\n",
    "        yield chapter, text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 615508 paragraphs to swords.parquet\n"
     ]
    }
   ],
   "source": [
    "from glob import glob\n",
    "import pandas as pd\n",
    "from IPython.display import clear_output\n",
    "import IPython.utils\n",
    "\n",
    "# Per-book overrides, keyed by the EPUB file name without its extension.\n",
    "# Recognised keys (all optional):\n",
    "#   'try_chapter': passed to parse_ebook_html(); treated as False when absent.\n",
    "#   'drop':        chapter_ix values whose paragraphs are removed.\n",
    "#   'clip':        (first, last) paragraph_ix ranges, inclusive, whose paragraphs are removed.\n",
    "#   'strip_nl':    when True, newlines in the paragraph text are replaced with spaces.\n",
    "# cryptonomicon requires try_chapter=True, and needs '| |' removed\n",
    "special = {\n",
    "    'a_clash_of_kings': {'try_chapter': True, 'drop': [1,2,76,75,74,73,72]},\n",
    "    'a_court_of_frost_and_starlight' : {'drop': [1,2,3,34,35,36,37]},\n",
    "    'a_court_of_mist_and_fury': {'try_chapter': True, 'drop': [1,2,3,81,80,79,78,77,76,75]},\n",
    "    'a_court_of_silver_flames': {'try_chapter': True, 'drop': [1,2,3,4,90,89]},\n",
    "    'a_court_of_thorns_and_roses': { 'drop': [1,2, 52,51,50,49]},\n",
    "    'a_court_of_wings_and_ruin': {'try_chapter': True, 'drop': [1,2,3,4,5,94,93,92,91,90]},\n",
    "    'a_crown_of_swords': {'try_chapter': True, 'drop': [1,2,3,4,5,52,51,50,49,48,47]},\n",
    "    'a_dance_with_dragons': { 'drop': [1,2,3,4,5,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78]},\n",
    "    'a_feast_for_crows': {'try_chapter': True, 'drop': [1,2,3,4,60,59,58,57,56,55,54,53,52,51,50]},\n",
    "    'a_game_of_thrones': {'try_chapter': False, 'drop': [1,2,3,85,84,83,82,81,80,79,78,77,76], 'clip': [(6808, 6811)]},\n",
    "    'a_memory_of_light': { 'drop': [], 'clip': [(11168, 11236)]},\n",
    "    'a_storm_of_swords': {'try_chapter': True, 'drop': [1,2,93,92,91,90,89,88,87,86,85]},\n",
    "    'a_time_of_blood': {'try_chapter': False, 'drop': [1,2,3,69,68,62,63,64,65,66,67]},\n",
    "    'a_time_of_courage': {'try_chapter': False, 'drop': [1,2,3,4,133,132,131,130,129,128,127], 'clip': [(7539, 7539)]},\n",
    "    'a_time_of_dread': {'try_chapter': False, 'drop': [1,2,3,4,5,61,60,59,58], 'clip': [(4942, 4943)]},\n",
    "    'a_wizard_of_earthsea': {'try_chapter': False, 'drop': [1,2,3], 'clip': [(1041, 1041)]},\n",
    "    'alice_in_wonderland': {'try_chapter': False, 'drop': [1,2,3]},\n",
    "    'avempartha': {'try_chapter': True, 'drop': [1], 'clip': [(3494, 3533)]},\n",
    "    'baptism_of_fire': {'try_chapter': False, 'drop': [1,2,3,4,14,15,16,17]},\n",
    "    'blackflame': {'try_chapter': False, 'drop': [1,2,27,26], 'clip': [(3710, 3712)]},\n",
    "    'blade_of_tyshalle': {'try_chapter': False, 'drop': [1,2,33,32], 'clip': [(9070, 9071)]},\n",
    "    'blood_of_elves': {'try_chapter': False, 'drop': [1,2]},\n",
    "    'blood_of_the_fold': {'try_chapter': False, 'drop': [56], 'clip': [(7072, 7073)]},\n",
    "    'bloodline': {'try_chapter': False, 'drop': [1,2,26], 'clip': [(4559, 4644)]},\n",
    "    'caine_black_knife': {'try_chapter': True, 'drop': [1,2,3,4,5,28,29,30], 'clip': [(5852, 5861)]},\n",
    "    'chains_of_gaia': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(4162, 4165)]},\n",
    "    'crossroads_of_twilight': {'try_chapter': False, 'drop': [], 'clip': [(2529, 2692)], 'strip_nl': True},\n",
    "    'crown_conspiracy': {'try_chapter': False, 'drop': []},\n",
    "    'death_masks': {'try_chapter': False, 'drop': []},\n",
    "    'dragon_reborn': {'try_chapter': False, 'drop': [1,2,3,4,5,6,65], 'clip': [(4818, 4999)]},\n",
    "    'dragons_of_autumn_twilight': {'try_chapter': False, 'drop': [1,2], 'strip_nl': True},\n",
    "    'dragons_of_spring_dawning': {'try_chapter': False, 'drop': [1,2,3,4,42], 'clip': [(3552, 3556)]},\n",
    "    'dragons_of_winter_night': {'try_chapter': False, 'drop': [1,2,3,4], 'clip': [(3865, 3869)]},\n",
    "    'eye_of_the_world_the': {'try_chapter': False, 'drop': [1,2,3,4,5,61,60], 'clip': [(5955, 6211)]},\n",
    "    'fellowship_of_the_ring': {'try_chapter': False, 'drop': [], 'clip': [(3810, 3811)]},\n",
    "    'ghostwater': {'try_chapter': False, 'drop': [1,2,3,24], 'clip': [(3275, 3299)]},\n",
    "    'good_omens': {'try_chapter': False, 'drop': [1,2,3,4], 'clip': [(4616, 4814)]},\n",
    "    'graceling': {'try_chapter': True, 'drop': [1,2,3,4,5,6,7,50], 'clip': [(3647, 3650)]},\n",
    "    'great_hunt': {'try_chapter': True, 'drop': [55,56,57,58,59,60], 'clip': [(5241, 5535)]},\n",
    "    'heir_of_novron': {'try_chapter': False, 'drop': [1,2,3,4,5,59,60,61]},\n",
    "    'heroes_die': {'try_chapter': True, 'drop': [1,2,3,4,5], 'clip': [(6451, 6451)]},\n",
    "    'isle_of_winds': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(2990, 2993)]},\n",
    "    'knife_of_dreams': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,49], 'clip': [(4436, 4521)]},\n",
    "    'lady_of_avalon': {'try_chapter': False, 'drop': [1,2,3,4,5,6,39,40], 'clip': [(3812, 3812)]},\n",
    "    'lion_the_witch_and_the_wardrobe': {'try_chapter': False, 'drop': [1]},\n",
    "    'lord_of_chaos': {'try_chapter': True, 'drop': [1]},\n",
    "    'malice': {'try_chapter': True, 'drop': [1,2,3,4,5,96], 'clip': [(7728, 7823)]},\n",
    "    'mistborn_the_final_empire': {'try_chapter': True, 'drop': [1,2,3,4,43,44,45], 'clip': [(7687, 7726)]},\n",
    "    'mists_of_avalon': {'try_chapter': True, 'drop': [1,2,3,4,5,74,75,76,77]},\n",
    "    'nyphron_rising': {'try_chapter': False, 'drop': [1,2], 'clip': [(3473, 3482)], 'strip_nl': True},\n",
    "    'oathbringer': {'try_chapter': True, 'drop': [1,2,3,4,5,6]},\n",
    "    'prince_caspian': {'try_chapter': False, 'drop': []},\n",
    "    'ravens_of_avalon': {'try_chapter': False, 'drop': [1,6]},\n",
    "    'reaper': {'try_chapter': False, 'drop': [1,2,31], 'clip': [(5290, 5315)]},\n",
    "    'red_seas_under_red_skies': {'try_chapter': True, 'drop': [1,2,3,26,27,28,29,30,31], 'clip': [(7146, 7361)]},\n",
    "    'return_of_the_king': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,10,119,120,121,122,123,124], 'clip': [(4895, 5083)]},\n",
    "    'rise_of_empire': {'try_chapter': False, 'drop': [1,2,3,4,5,6,51,52,53]},\n",
    "    'ruin_us': {'try_chapter': False, 'drop': [1,2,3,4,5,6,102,103,104,105,106,107]},\n",
    "    'skysworn': {'try_chapter': False, 'drop': [1,2,3,4,5,22], 'clip': [(2875, 2903)]},\n",
    "    'soulsmith': {'try_chapter': True, 'drop': [1,2,3,4,25,26], 'clip': [(2573, 2574)]},\n",
    "    'stone_of_tears': {'try_chapter': False, 'drop': [1]},\n",
    "    'storm_front': {'try_chapter': True, 'drop': [1,2], 'clip': [(2208, 2234)]},\n",
    "    'summer_knight': {'try_chapter': False, 'drop': []},\n",
    "    'sword_of_destiny': {'try_chapter': False, 'drop': [1]},\n",
    "    'test_of_the_twins': {'try_chapter': False, 'drop': [1,2,3,4,5,44,45], 'clip': [(2673, 2677)], 'strip_nl': True},\n",
    "    'the_chainfire': {'try_chapter': False, 'drop': [1]},\n",
    "    'the_confessor': {'try_chapter': False, 'drop': [1], 'clip': [(6800, 6800)]},\n",
    "    'the_crown_tower': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,32,33,34,31]},\n",
    "    'the_drowned_tomb': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(3342, 3345)]},\n",
    "    'the_emerald_storm': {'try_chapter': False, 'drop': [1]},\n",
    "    'the_exercise_of_vital_powers': {'try_chapter': False, 'drop': [1,2,3,4,5,6,30,31], 'clip': [(3785, 3788)]},\n",
    "    'the_faith_of_the_fallen': {'try_chapter': False, 'drop': [1]},\n",
    "    'the_fires_of_heaven': {'try_chapter': True, 'drop': [1]},\n",
    "    'the_forest_house': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,39,40]},\n",
    "    'the_gathering_storm': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,64,65], 'clip': [(7915, 8006)]},\n",
    "    'the_guns_of_empire': {'try_chapter': False, 'drop': [1,2,3,4,5]},\n",
    "    'the_hero_of_ages': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,99,100,101]},\n",
    "    'the_hobbit_or_there_and_back_again': {'try_chapter': False, 'drop': [1,24,25,26,27,28], 'clip': [(1845, 1970)], 'strip_nl': True},\n",
    "    'the_horse_and_his_boy': {'try_chapter': False, 'drop': [1,2,23,24], 'clip': [(940, 964)]},\n",
    "    'the_infernal_battalion': {'try_chapter': False, 'drop': [1,2,3,45,46,47,48], 'clip': [(6135, 6166)]},\n",
    "    'the_lady_of_the_lake': {'try_chapter': False, 'drop': [1,2]},\n",
    "    'the_last_battle': {'try_chapter': False, 'drop': [1,2,19,20,21,22,23], 'clip': [(940, 964)]},\n",
    "    'the_last_wish': {'try_chapter': False, 'drop': [1]},\n",
    "    'the_lies_of_locke_lamora': {'try_chapter': False, 'drop': [1], 'clip': [(4971, 4984)]},\n",
    "    'the_mists_of_avalon': {'try_chapter': False, 'drop': [1], 'clip': [(7834, 7835)]},\n",
    "    'the_naked_empire': {'try_chapter': False, 'drop': [1]},\n",
    "    'the_path_of_daggers': {'try_chapter': True, 'drop': [1,2,3,4,5,6,39,40], 'clip': [(2867, 2907)]},\n",
    "    'the_phantom': {'try_chapter': False, 'drop': [1,2], 'clip': [(5954, 5955)]},\n",
    "    'the_price_of_valor': {'try_chapter': True, 'drop': [1,2,3,4,40], 'clip': [(6195, 6237)]},\n",
    "    'the_princess_bride': {'try_chapter': False, 'drop': [1,2,3]},\n",
    "    'the_republic_of_thieves': {'try_chapter': False, 'drop': [1,2,175,176,177], 'clip': [(7812, 7812)]},\n",
    "    'the_rose_and_the_thorn': {'try_chapter': False, 'drop': [1,27,28,29], 'clip': [(3373, 3462)]},\n",
    "    'the_shadow_of_the_gods': {'try_chapter': False, 'drop': [1,2,57,58,59], 'clip': [(5088, 5095)]},\n",
    "    'the_shadow_rising': {'try_chapter': False, 'drop': [1,2], 'clip': [(6737, 6755)]},\n",
    "    'the_shadow_throne': {'try_chapter': True, 'drop': [1,2,3,4,5]},\n",
    "    'the_silver_chair': {'try_chapter': True, 'drop': [1]},\n",
    "    'the_slow_regard_of_silent_things': {'try_chapter': True, 'drop': [1,2,3,4,5], 'clip': [(748, 805)]},\n",
    "    'the_soul_of_the_fire': {'try_chapter': False, 'drop': [1], 'clip': [(8410, 8412)]},\n",
    "    'the_temple_of_the_winds': {'try_chapter': False, 'drop': [1]},\n",
    "    'the_the_omen_machine': {'try_chapter': False, 'drop': [1,88,89], 'clip': [(4469, 4516)]},\n",
    "    'the_the_pillars_of_creation': {'try_chapter': True, 'drop': [1,2], 'clip': [(6146, 6204)]},\n",
    "    'the_thousand_names': {'try_chapter': True, 'drop': [1,2,3,4,5,6]},\n",
    "    'the_tower_of_swallows': {'try_chapter': False, 'drop': [1,24,25,26,27,28], 'clip': [(4355, 4497)]},\n",
    "    'the_two_towers': {'try_chapter': True, 'drop': [1,2,3,4,5,29,30,31,32,33], 'clip': [(2787, 2834)]},\n",
    "    'the_voyage_of_the_dawn_treader': {'try_chapter': False, 'drop': [1,2,3,20,21,22,23,24], 'clip': [(1115, 1139)]},\n",
    "    'the_way_of_kings': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,97,98,99], 'clip': [(12698, 12843)]},\n",
    "    'the_well_of_ascension': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,10,11,141,142,143,144,145,146], 'clip': [(9335, 9343)]},\n",
    "    'theft_of_swords': {'try_chapter': False, 'drop': [1,2,3,4,5,6,35]},\n",
    "    'time_of_the_twins': {'try_chapter': True, 'drop': [1,2,3], 'clip': [(3578, 3579)]},\n",
    "    'towers_of_midnight': {'try_chapter': True, 'drop': [1,2,57], 'clip': [(10750, 10835)]},\n",
    "    'underlord': {'try_chapter': False, 'drop': [1,2,3,27], 'clip': [(3636, 3703)]},\n",
    "    'unsouled': {'try_chapter': True, 'drop': [1,2,3,4,25,26], 'clip': [(2492, 2493)]},\n",
    "    'valour': {'try_chapter': False, 'drop': [1,2,3,4,5,126,127], 'clip': [(7497, 7510)]},\n",
    "    'war_of_the_twins': {'try_chapter': True, 'drop': [1]},\n",
    "    'wintersteel': {'try_chapter': False, 'drop': [1,2,3,32], 'clip': [(6603, 6603)]},\n",
    "    'wintertide': {'try_chapter': True, 'drop': [1,2,3,4,5,75,76,77], 'clip': [(3092, 3112)]},\n",
    "    'words_of_radiance': {'try_chapter': False, 'drop': [1,2,3,4,110,111,112,107,108,109], 'clip': [(15840, 15860)]},\n",
    "    'wrath': {'try_chapter': False, 'drop': [1,2,3,4,5,133,134,135], 'clip': [(8460, 8469)]},\n",
    "}\n",
    "\n",
    "# Column layout of the tuples produced by parse_ebook_html().\n",
    "BOOK_COLUMNS = ['book_name', 'paragraph_ix', 'chapter_ix', 'chapter_title', 'text', 'char_count', 'cumsum_char_count']\n",
    "# How many trailing paragraphs to echo per book when hunting for back matter to clip.\n",
    "PREVIEW_ROWS = 200\n",
    "\n",
    "# One frame per book, concatenated once after the loop. Growing all_books with\n",
    "# pd.concat on every iteration re-copies everything gathered so far.\n",
    "book_frames = []\n",
    "for book in sorted(glob('source/*.epub')):\n",
    "    name = os.path.splitext(os.path.basename(book))[0]\n",
    "    print(f\"Processing {name}\")\n",
    "    # Books without an entry in `special` are processed with the defaults.\n",
    "    config = special.get(name, {})\n",
    "    try_chapter = config.get('try_chapter', False)\n",
    "    drops = config.get('drop', [])\n",
    "    clips = config.get('clip', [])\n",
    "    strip_nl = config.get('strip_nl', False)\n",
    "\n",
    "    lines = parse_ebook_html(book, try_chapter=try_chapter)\n",
    "    new_frame = pd.DataFrame(lines, columns=BOOK_COLUMNS)\n",
    "    # Remove whole chapters (front/back matter) listed for this book.\n",
    "    new_frame = new_frame[~new_frame['chapter_ix'].isin(drops)]\n",
    "    for clip in clips:\n",
    "        # we want to remove the paragraph id's that are in the clip range, inclusive\n",
    "        print(f\"Clipping {clip}\")\n",
    "        minv, maxv = clip\n",
    "        new_frame = new_frame[~new_frame['paragraph_ix'].between(minv, maxv)]\n",
    "    if strip_nl:\n",
    "        # Turn newlines (and any other whitespace run) into a single space. The previous\n",
    "        # second step called Series.replace, which only matches whole cell values and so\n",
    "        # never touched the text. assign() also avoids writing into a filtered view.\n",
    "        new_frame = new_frame.assign(text=new_frame['text'].str.replace(r'\\s+', ' ', regex=True))\n",
    "\n",
    "    new_frame = new_frame.reset_index(drop=True)\n",
    "\n",
    "    # Echo the end of the book in reading order. tail() copes with books that have\n",
    "    # fewer than PREVIEW_ROWS paragraphs, where indexing with iloc[-i] raised IndexError.\n",
    "    for _, row in new_frame.tail(PREVIEW_ROWS).iterrows():\n",
    "        print(row['paragraph_ix'], row['text'])\n",
    "    print(end='', flush=True)\n",
    "    # Uncomment to pause on each book and read the preview before it is cleared.\n",
    "    #input('okay')\n",
    "\n",
    "    clear_output()\n",
    "\n",
    "    book_frames.append(new_frame)\n",
    "\n",
    "if book_frames:\n",
    "    all_books = pd.concat(book_frames, ignore_index=True)\n",
    "else:\n",
    "    # No EPUBs found: still write an empty table with the expected columns.\n",
    "    all_books = pd.DataFrame([], columns=BOOK_COLUMNS)\n",
    "all_books.to_parquet('swords.parquet')\n",
    "print(f\"Saved {len(all_books)} paragraphs to swords.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "all_books = pd.read_parquet('swords.parquet')\n",
    "\n",
    "# Number of chapters inspected at each end of a book.\n",
    "EDGE_CHAPTERS = 6\n",
    "\n",
    "def print_chapter_openings(book, book_rows, chapter_numbers):\n",
    "    \"\"\"\n",
    "    Prints the first paragraph of each requested chapter, or 'missing' when the\n",
    "    book has no rows for that chapter number.\n",
    "\n",
    "    Parameters:\n",
    "    - book (str): Book name, used only as a prefix on every printed line.\n",
    "    - book_rows (pd.DataFrame): Rows of one book, with 'chapter_ix' and 'text' columns.\n",
    "    - chapter_numbers (iterable of int): chapter_ix values to look up, in print order.\n",
    "    \"\"\"\n",
    "    for chapter_no in chapter_numbers:\n",
    "        in_chapter = book_rows['chapter_ix'] == chapter_no\n",
    "        if not in_chapter.any():\n",
    "            print(book, chapter_no, 'missing')\n",
    "            continue\n",
    "        print(book, chapter_no, book_rows.loc[in_chapter, 'text'].iloc[0])\n",
    "\n",
    "# For every book whose EPUB is still present, show how the first and last kept\n",
    "# chapters begin, so leftover front or back matter is easy to spot.\n",
    "for book in all_books['book_name'].unique():\n",
    "    if not os.path.exists(f'source/{book}.epub'):\n",
    "        continue\n",
    "    bookd = all_books[all_books['book_name'] == book]\n",
    "    min_chapter = bookd['chapter_ix'].min()\n",
    "    max_chapter = bookd['chapter_ix'].max()\n",
    "    print(book, min_chapter, max_chapter)\n",
    "    print_chapter_openings(book, bookd, range(min_chapter, min_chapter + EDGE_CHAPTERS))\n",
    "    # The closing range includes max_chapter itself; it used to stop one chapter short.\n",
    "    print_chapter_openings(book, bookd, range(max_chapter - EDGE_CHAPTERS + 1, max_chapter + 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display\n",
    "\n",
    "# A bare expression in the middle of a cell is not rendered, so display it explicitly.\n",
    "display(all_books.head(10))\n",
    "\n",
    "# Index labels of the first paragraph of every chapter. A row opens a chapter when its\n",
    "# (book_name, chapter_ix) pair differs from the previous row's. Comparing chapter_ix\n",
    "# alone misses the boundary when one book ends on the chapter number the next starts on.\n",
    "chapter_keys = all_books[['book_name', 'chapter_ix']]\n",
    "starts_chapter = (chapter_keys != chapter_keys.shift()).any(axis=1)\n",
    "tops = all_books.index[starts_chapter].tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lift the column-width and row-count limits so every chapter-opening row is shown in full.\n",
    "for display_option in ('display.max_colwidth', 'display.max_rows'):\n",
    "    pd.set_option(display_option, None)\n",
    "# `tops` holds the positions of the first paragraph of each chapter.\n",
    "all_books.iloc[tops]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import os\n",
    "import re\n",
    "import shutil\n",
    "\n",
    "def slugify_title(epub_filename):\n",
    "    \"\"\"\n",
    "    Builds the slug used as the file name under ./source.\n",
    "\n",
    "    The extension is removed and the second ' - ' separated field of the name is taken\n",
    "    (the whole name when there is no ' - '). Everything except ASCII letters, digits and\n",
    "    spaces is then removed, whitespace runs become '_', and the result is lower-cased\n",
    "    and cut to 64 characters.\n",
    "\n",
    "    Parameters:\n",
    "    - epub_filename (str): Path or file name of the incoming EPUB.\n",
    "\n",
    "    Returns:\n",
    "    - str: The slug, which is empty when nothing usable remains.\n",
    "    \"\"\"\n",
    "    stem = os.path.splitext(os.path.basename(epub_filename))[0]\n",
    "    fields = stem.split(' - ')\n",
    "    # Names without a ' - ' separator used to raise an uncaught IndexError.\n",
    "    title = fields[1] if len(fields) > 1 else stem\n",
    "    title = re.sub(r'[^a-zA-Z0-9 ]', '', title)\n",
    "    title = re.sub(r'\\s+', '_', title)\n",
    "    return title.lower()[:64]\n",
    "\n",
    "os.makedirs('./source', exist_ok=True)\n",
    "for filename in glob('./incoming/*.epub'):\n",
    "    basename = os.path.basename(filename)\n",
    "    noext = slugify_title(filename)\n",
    "    if not noext:\n",
    "        # Would otherwise be written as ./source/.epub\n",
    "        print(f\"Skipping {basename}: no usable title in file name\")\n",
    "        continue\n",
    "    try:\n",
    "        # Copy the file to ./source. shutil.copy needs no shell, so quotes or spaces in\n",
    "        # the incoming name cannot break the command, and failures raise OSError.\n",
    "        shutil.copy(filename, f'./source/{noext}.epub')\n",
    "        #print(f\"copied {filename} to {noext}.epub\")\n",
    "        print(noext, basename)\n",
    "    except OSError as e:\n",
    "        print(\"Error: %s : %s\" % (filename, e.strerror))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "jupyterenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}