Skip to content

Instantly share code, notes, and snippets.

@maldevide
Created April 4, 2024 18:30
Show Gist options
  • Save maldevide/34526cdd7ef2dbadca8c52c426d122a2 to your computer and use it in GitHub Desktop.
Save maldevide/34526cdd7ef2dbadca8c52c426d122a2 to your computer and use it in GitHub Desktop.
epub book processing
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup, NavigableString, Tag\n",
"import ebooklib\n",
"from ebooklib import epub\n",
"import os\n",
"import re\n",
"from typing import Generator, List\n",
"\n",
"def parse_ebook_html(ebook_path: str, try_chapter : bool = False) -> Generator[tuple, None, None]:\n",
" \"\"\"\n",
" Parses the HTML content of an EPUB file, yielding only text content from each <p> block,\n",
" while skipping specific elements with class 'calibre3' but considering valid text that follows.\n",
"\n",
" Parameters:\n",
" - ebook_path (str): The path to the EPUB file.\n",
" - try_chapter (bool): If True, the first paragraph of each chapter will be used to determine the chapter title.\n",
"\n",
" Returns:\n",
" - text_generator (Generator[tuple, None, None]): A generator yielding text content.\n",
" \"\"\"\n",
" book = epub.read_epub(ebook_path)\n",
" basename = os.path.basename(ebook_path)\n",
" noext = os.path.splitext(basename)[0]\n",
" chapter_idx = 0\n",
" paragraph_idx = 0\n",
" cumsum_word_count = 0\n",
" for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):\n",
" content = item.get_content().decode('utf-8')\n",
" results = list(html_tokenizer(content, try_chapter))\n",
" if len(results) == 0:\n",
" continue\n",
" chapter_idx += 1\n",
" for row in results:\n",
" if len(row[1]) == 0:\n",
" continue\n",
" paragraph_idx += 1\n",
" word_count = len((row[1]))\n",
" cumsum_word_count += word_count\n",
" row = [noext, paragraph_idx, chapter_idx] + list(row[:]) + [word_count, cumsum_word_count]\n",
" yield tuple(row)\n",
"\n",
"def html_tokenizer(html_content: str, try_chapter) -> Generator[tuple, None, None]:\n",
" \"\"\"\n",
" Generator function to tokenize HTML content, yielding text content from each <p> block.\n",
"\n",
" Parameters:\n",
" - html_content (str): The HTML content to be tokenized.\n",
" - try_chapter (bool): If True, the first paragraph of each chapter will be used to determine the chapter title.\n",
"\n",
" Yields:\n",
" - text_generator (Generator[tuple, None, None]): A generator yielding text content. \n",
" \"\"\"\n",
" soup = BeautifulSoup(html_content, 'html.parser')\n",
" fix_quote = re.compile(r'“|”|»|«')\n",
" fix_threedot = re.compile(r'…')\n",
" fix_bars = re.compile(r'\\|\\s*\\|')\n",
" fix_spaces = re.compile(r'\\s+')\n",
"\n",
" def extract_and_yield_text(element, accumulated_texts: List[str]):\n",
" if isinstance(element, NavigableString):\n",
" accumulated_texts.append(str(element))\n",
" elif isinstance(element, Tag):\n",
" if element.name == 'a' and 'calibre3' in element.get('class', []):\n",
" # Skip processing the <a class=\"calibre3\"> tag itself, but not its siblings\n",
" #print('skipping', element)\n",
" return\n",
" if element.name == 'span' and 'italic' in element.get('class', []):\n",
" # Append italic text directly to the accumulated_texts list without yielding\n",
" accumulated_texts.append(element.get_text())\n",
" else:\n",
" # Recursively process all children, including those following skipped elements\n",
" for child in element.children:\n",
" extract_and_yield_text(child, accumulated_texts)\n",
"\n",
" chapter = None\n",
" for i, p_tag in enumerate(soup.find_all('p')):\n",
" accumulated_texts = []\n",
" # if p's class is calibre14, skip it because it's metadata\n",
" if 'calibre14' in p_tag.get('class', []):\n",
" #print('skipping', i)\n",
" #continue\n",
" pass\n",
" else:\n",
" #print('processing', i)\n",
" if i == 0 and try_chapter:\n",
" # Instead of processing, this contains our chapter and title\n",
" markers = []\n",
" for span in p_tag.find_all('span', class_='bold'):\n",
" markers.append(span.get_text())\n",
"\n",
" if len(markers) >= 2:\n",
" chapter = ' '.join(markers)\n",
" continue\n",
" \n",
" extract_and_yield_text(p_tag, accumulated_texts)\n",
" # if our text is '| |', skip it\n",
" if '| |' in ' '.join(accumulated_texts):\n",
" continue\n",
" text = ' '.join([text.strip() for text in accumulated_texts if text.strip()])\n",
" text = text.replace('\\n', ' ')\n",
" text = text.replace(u'\\xa0', u' ')\n",
" text = fix_quote.sub(u'\"', text)\n",
" text = fix_threedot.sub(u'...', text)\n",
" text = fix_bars.sub(u'', text)\n",
" text = fix_spaces.sub(u' ', text)\n",
" text = text.strip()\n",
" if text.find('Oceano') != -1:\n",
" continue\n",
" # If the first character is a capital letter, then a space, followed by more capital letters, it is likely the beginning of a chapter and needs to have the space removed\n",
" if len(text) == 0:\n",
" continue\n",
" if len(text) < 4 and text.isnumeric():\n",
" continue\n",
" #elif len(text) > 2 and text[0].isupper() and text[1] == ' ' and text[2].isupper():\n",
" # text = text[0] + text[2:]\n",
" yield chapter, text\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved 615508 paragraphs to swords.parquet\n"
]
}
],
"source": [
"from glob import glob\n",
"import pandas as pd\n",
"from IPython.display import clear_output\n",
"import IPython.utils\n",
"\n",
"# special rules.\n",
"# cryptonomicon requires try_chapter=True, and needs '| |' removed\n",
"special = {\n",
" 'a_clash_of_kings': {'try_chapter': True, 'drop': [1,2,76,75,74,73,72]},\n",
" 'a_court_of_frost_and_starlight' : {'drop': [1,2,3,34,35,36,37]},\n",
" 'a_court_of_mist_and_fury': {'try_chapter': True, 'drop': [1,2,3,81,80,79,78,77,76,75]},\n",
" 'a_court_of_silver_flames': {'try_chapter': True, 'drop': [1,2,3,4,90,89]},\n",
" 'a_court_of_thorns_and_roses': { 'drop': [1,2, 52,51,50,49]},\n",
" 'a_court_of_wings_and_ruin': {'try_chapter': True, 'drop': [1,2,3,4,5,94,93,92,91,90]},\n",
" 'a_crown_of_swords': {'try_chapter': True, 'drop': [1,2,3,4,5,52,51,50,49,48,47]},\n",
" 'a_dance_with_dragons': { 'drop': [1,2,3,4,5,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78]},\n",
" 'a_feast_for_crows': {'try_chapter': True, 'drop': [1,2,3,4,60,59,58,57,56,55,54,53,52,51,50]},\n",
" 'a_game_of_thrones': {'try_chapter': False, 'drop': [1,2,3,85,84,83,82,81,80,79,78,77,76], 'clip': [(6808, 6811)]},\n",
" 'a_memory_of_light': { 'drop': [], 'clip': [(11168, 11236)]},\n",
" 'a_storm_of_swords': {'try_chapter': True, 'drop': [1,2,93,92,91,90,89,88,87,86,85]},\n",
" 'a_time_of_blood': {'try_chapter': False, 'drop': [1,2,3,69,68,62,63,64,65,66,67]},\n",
" 'a_time_of_courage': {'try_chapter': False, 'drop': [1,2,3,4,133,132,131,130,129,128,127], 'clip': [(7539, 7539)]},\n",
" 'a_time_of_dread': {'try_chapter': False, 'drop': [1,2,3,4,5,61,60,59,58], 'clip': [(4942, 4943)]},\n",
" 'a_wizard_of_earthsea': {'try_chapter': False, 'drop': [1,2,3], 'clip': [(1041, 1041)]},\n",
" 'alice_in_wonderland': {'try_chapter': False, 'drop': [1,2,3]},\n",
" 'avempartha': {'try_chapter': True, 'drop': [1], 'clip': [(3494, 3533)]},\n",
" 'baptism_of_fire': {'try_chapter': False, 'drop': [1,2,3,4,14,15,16,17]},\n",
" 'blackflame': {'try_chapter': False, 'drop': [1,2,27,26], 'clip': [(3710, 3712)]},\n",
" 'blade_of_tyshalle': {'try_chapter': False, 'drop': [1,2,33,32], 'clip': [(9070, 9071)]},\n",
" 'blood_of_elves': {'try_chapter': False, 'drop': [1,2]},\n",
" 'blood_of_the_fold': {'try_chapter': False, 'drop': [56], 'clip': [(7072, 7073)]},\n",
" 'bloodline': {'try_chapter': False, 'drop': [1,2,26], 'clip': [(4559, 4644)]},\n",
" 'caine_black_knife': {'try_chapter': True, 'drop': [1,2,3,4,5,28,29,30], 'clip': [(5852, 5861)]},\n",
" 'chains_of_gaia': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(4162, 4165)]},\n",
" 'crossroads_of_twilight': {'try_chapter': False, 'drop': [], 'clip': [(2529, 2692)], 'strip_nl': True},\n",
" 'crown_conspiracy': {'try_chapter': False, 'drop': []},\n",
" 'death_masks': {'try_chapter': False, 'drop': []},\n",
" 'dragon_reborn': {'try_chapter': False, 'drop': [1,2,3,4,5,6,65], 'clip': [(4818, 4999)]},\n",
" 'dragons_of_autumn_twilight': {'try_chapter': False, 'drop': [1,2], 'stip_nl': True},\n",
" 'dragons_of_spring_dawning': {'try_chapter': False, 'drop': [1,2,3,4,42], 'clip': [(3552, 3556)]},\n",
" 'dragons_of_winter_night': {'try_chapter': False, 'drop': [1,2,3,4], 'clip': [(3865, 3869)]},\n",
" 'eye_of_the_world_the': {'try_chapter': False, 'drop': [1,2,3,4,5,61,60], 'clip': [(5955, 6211)]},\n",
" 'fellowship_of_the_ring': {'try_chapter': False, 'drop': [], 'clip': [(3810, 3811)]},\n",
" 'ghostwater': {'try_chapter': False, 'drop': [1,2,3,24], 'clip': [(3275, 3299)]},\n",
" 'good_omens': {'try_chapter': False, 'drop': [1,2,3,4], 'clip': [(4616, 4814)]},\n",
" 'graceling': {'try_chapter': True, 'drop': [1,2,3,4,5,6,7,50], 'clip': [(3647, 3650)]},\n",
" 'great_hunt': {'try_chapter': True, 'drop': [55,56,57,58,59,60], 'clip': [(5241, 5535)]},\n",
" 'heir_of_novron': {'try_chapter': False, 'drop': [1,2,3,4,5,59,60,61]},\n",
" 'heroes_die': {'try_chapter': True, 'drop': [1,2,3,4,5], 'clip': [(6451, 6451)]},\n",
" 'isle_of_winds': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(2990, 2993)]},\n",
" 'knife_of_dreams': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,49], 'clip': [(4436, 4521)]},\n",
" 'lady_of_avalon': {'try_chapter': False, 'drop': [1,2,3,4,5,6,39,40], 'clip': [(3812, 3812)]},\n",
" 'lion_the_witch_and_the_wardrobe': {'try_chapter': False, 'drop': [1]},\n",
" 'lord_of_chaos': {'try_chapter': True, 'drop': [1]},\n",
" 'malice': {'try_chapter': True, 'drop': [1,2,3,4,5,96], 'clip': [(7728, 7823)]},\n",
" 'mistborn_the_final_empire': {'try_chapter': True, 'drop': [1,2,3,4,43,44,45], 'clip': [(7687, 7726)]},\n",
" 'mists_of_avalon': {'try_chapter': True, 'drop': [1,2,3,4,5,74,75,76,77]},\n",
" 'nyphron_rising': {'try_chapter': False, 'drop': [1,2], 'clip': [(3473, 3482)], 'strip_nl': True},\n",
" 'oathbringer': {'try_chapter': True, 'drop': [1,2,3,4,5,6]},\n",
" 'prince_caspian': {'try_chapter': False, 'drop': []},\n",
" 'ravens_of_avalon': {'try_chapter': False, 'drop': [1,6]},\n",
" 'reaper': {'try_chapter': False, 'drop': [1,2,31], 'clip': [(5290, 5315)]},\n",
" 'red_seas_under_red_skies': {'try_chapter': True, 'drop': [1,2,3,26,27,28,29,30,31], 'clip': [(7146, 7361)]},\n",
" 'return_of_the_king': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,10,119,120,121,122,123,124], 'clip': [(4895, 5083)]},\n",
" 'rise_of_empire': {'try_chapter': False, 'drop': [1,2,3,4,5,6,51,52,53]},\n",
" 'ruin_us': {'try_chapter': False, 'drop': [1,2,3,4,5,6,102,103,104,105,106,107]},\n",
" 'skysworn': {'try_chapter': False, 'drop': [1,2,3,4,5,22], 'clip': [(2875, 2903)]},\n",
" 'soulsmith': {'try_chapter': True, 'drop': [1,2,3,4,25,26], 'clip': [(2573, 2574)]},\n",
" 'stone_of_tears': {'try_chapter': False, 'drop': [1]},\n",
" 'storm_front': {'try_chapter': True, 'drop': [1,2], 'clip': [(2208, 2234)]},\n",
" 'summer_knight': {'try_chapter': False, 'drop': []},\n",
" 'sword_of_destiny': {'try_chapter': False, 'drop': [1]},\n",
" 'test_of_the_twins': {'try_chapter': False, 'drop': [1,2,3,4,5,44,45], 'clip': [(2673, 2677)], 'strip_nl': True},\n",
" 'the_chainfire': {'try_chapter': False, 'drop': [1]},\n",
" 'the_confessor': {'try_chapter': False, 'drop': [1], 'clip': [(6800, 6800)]},\n",
" 'the_crown_tower': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,32,33,34,31]},\n",
" 'the_drowned_tomb': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(3342, 3345)]},\n",
" 'the_emerald_storm': {'try_chapter': False, 'drop': [1]},\n",
" 'the_exercise_of_vital_powers': {'try_chapter': False, 'drop': [1,2,3,4,5,6,30,31], 'clip': [(3785, 3788)]},\n",
" 'the_faith_of_the_fallen': {'try_chapter': False, 'drop': [1]},\n",
" 'the_fires_of_heaven': {'try_chapter': True, 'drop': [1]},\n",
" 'the_forest_house': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,39,40]},\n",
" 'the_gathering_storm': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,64,65], 'clip': [(7915, 8006)]},\n",
" 'the_guns_of_empire': {'try_chapter': False, 'drop': [1,2,3,4,5]},\n",
" 'the_hero_of_ages': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,99,100,101]},\n",
" 'the_hobbit_or_there_and_back_again': {'try_chapter': False, 'drop': [1,24,25,26,27,28], 'clip': [(1845, 1970)], 'strip_nl': True},\n",
" 'the_horse_and_his_boy': {'try_chapter': False, 'drop': [1,2,23,24], 'clip': [(940, 964)]},\n",
" 'the_infernal_battalion': {'try_chapter': False, 'drop': [1,2,3,45,46,47,48], 'clip': [(6135, 6166)]},\n",
" 'the_lady_of_the_lake': {'try_chapter': False, 'drop': [1,2]},\n",
" 'the_last_battle': {'try_chapter': False, 'drop': [1,2,19,20,21,22,23], 'clip': [(940, 964)]},\n",
" 'the_last_wish': {'try_chapter': False, 'drop': [1]},\n",
" 'the_lies_of_locke_lamora': {'try_chapter': False, 'drop': [1], 'clip': [(4971, 4984)]},\n",
" 'the_mists_of_avalon': {'try_chapter': False, 'drop': [1], 'clip': [(7834, 7835)]},\n",
" 'the_naked_empire': {'try_chapter': False, 'drop': [1]},\n",
" 'the_path_of_daggers': {'try_chapter': True, 'drop': [1,2,3,4,5,6,39,40], 'clip': [(2867, 2907)]},\n",
" 'the_phantom': {'try_chapter': False, 'drop': [1,2], 'clip': [(5954, 5955)]},\n",
" 'the_price_of_valor': {'try_chapter': True, 'drop': [1,2,3,4,40], 'clip': [(6195, 6237)]},\n",
" 'the_princess_bride': {'try_chapter': False, 'drop': [1,2,3]},\n",
" 'the_republic_of_thieves': {'try_chapter': False, 'drop': [1,2,175,176,177], 'clip': [(7812, 7812)]},\n",
" 'the_rose_and_the_thorn': {'try_chapter': False, 'drop': [1,27,28,29], 'clip': [(3373, 3462)]},\n",
" 'the_shadow_of_the_gods': {'try_chapter': False, 'drop': [1,2,57,58,59], 'clip': [(5088, 5095)]},\n",
" 'the_shadow_rising': {'try_chapter': False, 'drop': [1,2], 'clip': [(6737, 6755)]},\n",
" 'the_shadow_throne': {'try_chapter': True, 'drop': [1,2,3,4,5]},\n",
" 'the_silver_chair': {'try_chapter': True, 'drop': [1]},\n",
" 'the_slow_regard_of_silent_things': {'try_chapter': True, 'drop': [1,2,3,4,5], 'clip': [(748, 805)]},\n",
" 'the_soul_of_the_fire': {'try_chapter': False, 'drop': [1], 'clip': [(8410, 8412)]},\n",
" 'the_temple_of_the_winds': {'try_chapter': False, 'drop': [1]},\n",
" 'the_the_omen_machine': {'try_chapter': False, 'drop': [1,88,89], 'clip': [(4469, 4516)]},\n",
" 'the_the_pillars_of_creation': {'try_chapter': True, 'drop': [1,2], 'clip': [(6146, 6204)]},\n",
" 'the_thousand_names': {'try_chapter': True, 'drop': [1,2,3,4,5,6]},\n",
" 'the_tower_of_swallows': {'try_chapter': False, 'drop': [1,24,25,26,27,28], 'clip': [(4355, 4497)]},\n",
" 'the_two_towers': {'try_chapter': True, 'drop': [1,2,3,4,5,29,30,31,32,33], 'clip': [(2787, 2834)]},\n",
" 'the_voyage_of_the_dawn_treader': {'try_chapter': False, 'drop': [1,2,3,20,21,22,23,24], 'clip': [(1115, 1139)]},\n",
" 'the_way_of_kings': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,97,98,99], 'clip': [(12698, 12843)]},\n",
" 'the_well_of_ascension': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,10,11,141,142,143,144,145,146], 'clip': [(9335, 9343)]},\n",
" 'theft_of_swords': {'try_chapter': False, 'drop': [1,2,3,4,5,6,35]},\n",
" 'time_of_the_twins': {'try_chapter': True, 'drop': [1,2,3], 'clip': [(3578, 3579)]},\n",
" 'towers_of_midnight': {'try_chapter': True, 'drop': [1,2,57], 'clip': [(10750, 10835)]},\n",
" 'underlord': {'try_chapter': False, 'drop': [1,2,3,27], 'clip': [(3636, 3703)]},\n",
" 'unsouled': {'try_chapter': True, 'drop': [1,2,3,4,25,26], 'clip': [(2492, 2493)]},\n",
" 'valour': {'try_chapter': False, 'drop': [1,2,3,4,5,126,127], 'clip': [(7497, 7510)]},\n",
" 'war_of_the_twins': {'try_chapter': True, 'drop': [1]},\n",
" 'wintersteel': {'try_chapter': False, 'drop': [1,2,3,32], 'clip': [(6603, 6603)]},\n",
" 'wintertide': {'try_chapter': True, 'drop': [1,2,3,4,5,75,76,77], 'clip': [(3092, 3112)]},\n",
" 'words_of_radiance': {'try_chapter': False, 'drop': [1,2,3,4,110,111,112,107,108,109], 'clip': [(15840, 15860)]},\n",
" 'wrath': {'try_chapter': False, 'drop': [1,2,3,4,5,133,134,135], 'clip': [(8460, 8469)]},\n",
"}\n",
"\n",
"all_books = pd.DataFrame([], columns=['book_name', 'paragraph_ix', 'chapter_ix', 'chapter_title', 'text', 'char_count', 'cumsum_char_count'])\n",
"for book in sorted(glob('source/*.epub')):\n",
" name = os.path.splitext(os.path.basename(book))[0]\n",
" print(f\"Processing {name}\")\n",
" try_chapter = False\n",
" clips = []\n",
" drops = []\n",
" strip_nl = False\n",
" if name in special:\n",
" config = special[name]\n",
" try_chapter = config.get('try_chapter', False)\n",
" if 'clip' in config:\n",
" clips = config['clip']\n",
" if 'drop' in config:\n",
" drops = config['drop']\n",
" if 'strip_nl' in config:\n",
" strip_nl = config['strip_nl']\n",
"\n",
" lines = parse_ebook_html(book, try_chapter=try_chapter)\n",
" new_frame = pd.DataFrame(lines, columns=['book_name', 'paragraph_ix', 'chapter_ix', 'chapter_title', 'text', 'char_count', 'cumsum_char_count'])\n",
" for drop in drops:\n",
" new_frame = new_frame[new_frame['chapter_ix'] != drop]\n",
" for clip in clips:\n",
" # we want to remove the paragraph id's that are in the clip range, inclusive\n",
" print(f\"Clipping {clip}\")\n",
" minv = clip[0]\n",
" maxv = clip[1]\n",
" idxr = range(minv, maxv + 1)\n",
" new_frame = new_frame[~new_frame['paragraph_ix'].isin(idxr)]\n",
" if strip_nl:\n",
" new_frame['text'] = new_frame['text'].str.replace('\\n', ' ').replace('\\s\\s', '')\n",
"\n",
" new_frame.reset_index(drop=True, inplace=True)\n",
"\n",
" for i in range(200):\n",
" row = new_frame.iloc[-i]\n",
" print(row['paragraph_ix'], row['text'])\n",
" print(end='', flush=True)\n",
" #input('okay')\n",
"\n",
" clear_output()\n",
" \n",
" all_books = pd.concat([all_books, new_frame.copy()], ignore_index=True)\n",
"all_books.to_parquet('swords.parquet')\n",
"print(f\"Saved {len(all_books)} paragraphs to swords.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"all_books = pd.read_parquet('swords.parquet')\n",
"#toks = tokenizer(all_books.iloc[100]['text'], return_tensors='pt').to('cuda')\n",
"#model = model.to('cuda')\n",
"#model.sample(input_ids=toks, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, return_dict=True)\n",
"#results = [\n",
"# pipe(text) for text in all_books['text'][:1000]\n",
"#]\n",
"#results = pipe(*all_books['text'][:1000].tolist())\n",
"for book in all_books['book_name'].unique():\n",
" bookd = all_books[all_books['book_name'] == book]\n",
" if not os.path.exists(f'source/{book}.epub'):\n",
" continue\n",
" min_chapter = bookd['chapter_ix'].min()\n",
" max_chapter = bookd['chapter_ix'].max()\n",
" print(book, min_chapter, max_chapter)\n",
" for i in range(6):\n",
" ch = bookd['chapter_ix'] == min_chapter + i\n",
" if not ch.any():\n",
" print(book, min_chapter + i, 'missing')\n",
" continue\n",
" t0 = bookd[ch]['text'].iloc[0]\n",
" print(book, min_chapter + i, t0)\n",
" for i in range(6,0,-1):\n",
" ch = bookd['chapter_ix'] == max_chapter - i\n",
" if not ch.any():\n",
" print(book, max_chapter - i, 'missing')\n",
" continue\n",
" tn = bookd[ch]['text'].iloc[0]\n",
" print(book, max_chapter - i, tn)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_books.head(10)\n",
"last_ix = 0\n",
"tops = []\n",
"for i, r in all_books.iterrows():\n",
" if last_ix != r['chapter_ix']:\n",
" tops.append(i)\n",
" last_ix = r['chapter_ix']\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n",
"pd.set_option('display.max_rows', None)\n",
"all_books.iloc[tops]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from glob import glob\n",
"import os\n",
"import re\n",
"\n",
"if not os.path.exists('./source'):\n",
" os.mkdir('./source')\n",
"for filename in glob('./incoming/*.epub'):\n",
" try:\n",
" basename = os.path.basename(filename)\n",
" # slugify\n",
" noext = os.path.splitext(basename)[0]\n",
" noext = noext.split(' - ')[1]\n",
" noext = re.sub(r'[^a-zA-Z0-9 ]', '', noext)\n",
" noext = re.sub(r'\\s+', '_', noext)\n",
" noext = noext.lower()\n",
" noext = noext[:64]\n",
" # Copy the file to ./source\n",
" os.system(f\"cp '{filename}' ./source/{noext}.epub\")\n",
" #print(f\"copied {filename} to {noext}.epub\")\n",
" print(noext, basename)\n",
" except OSError as e:\n",
" print(\"Error: %s : %s\" % (filename, e.strerror))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "jupyterenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment