Created
April 4, 2024 18:30
-
-
Save maldevide/34526cdd7ef2dbadca8c52c426d122a2 to your computer and use it in GitHub Desktop.
epub book processing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from bs4 import BeautifulSoup, NavigableString, Tag\n", | |
"import ebooklib\n", | |
"from ebooklib import epub\n", | |
"import os\n", | |
"import re\n", | |
"from typing import Generator, List\n", | |
"\n", | |
def parse_ebook_html(ebook_path: str, try_chapter : bool = False) -> Generator[tuple, None, None]:
    """
    Walk every document item of an EPUB and yield one row per non-empty paragraph.

    Each document is tokenized with ``html_tokenizer``. Documents that produce no
    paragraphs are ignored and do not consume a chapter number.

    Parameters:
    - ebook_path (str): The path to the EPUB file.
    - try_chapter (bool): Forwarded to ``html_tokenizer``; if True, the first paragraph of each
      document is inspected for a chapter title.

    Yields:
    - tuple: ``(book_name, paragraph_ix, chapter_ix, chapter_title, text, char_count, cumsum_char_count)``.
      ``book_name`` is the file name without its extension, both indices start at 1,
      and both counts are measured in characters of ``text`` (not words).
    """
    book_name = os.path.splitext(os.path.basename(ebook_path))[0]
    documents = epub.read_epub(ebook_path).get_items_of_type(ebooklib.ITEM_DOCUMENT)

    chapter_no = 0
    paragraph_no = 0
    running_chars = 0
    for document in documents:
        markup = document.get_content().decode('utf-8')
        # Materialize first: an empty document must not advance the chapter counter.
        paragraphs = list(html_tokenizer(markup, try_chapter))
        if not paragraphs:
            continue
        chapter_no += 1
        for paragraph in paragraphs:
            # paragraph is (chapter_title, text); the size is taken from the text.
            n_chars = len(paragraph[1])
            if n_chars == 0:
                continue
            paragraph_no += 1
            running_chars += n_chars
            yield (book_name, paragraph_no, chapter_no, *paragraph, n_chars, running_chars)
"\n", | |
def html_tokenizer(html_content: str, try_chapter) -> Generator[tuple, None, None]:
    """
    Tokenize HTML content into cleaned paragraph texts, one per <p> element.

    Parameters:
    - html_content (str): The HTML content to be tokenized.
    - try_chapter (bool): If True, the first <p> is searched for ``<span class="bold">`` elements; when it
      holds at least two, their texts joined by a space become the chapter title and that
      paragraph itself is not yielded.

    Yields:
    - tuple: ``(chapter, text)`` where ``chapter`` is the detected title (None if none was found)
      and ``text`` is the whitespace-normalized paragraph text.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    quote_re = re.compile(r'“|”|»|«')
    ellipsis_re = re.compile(r'…')
    bars_re = re.compile(r'\|\s*\|')
    whitespace_re = re.compile(r'\s+')

    def gather_text(node, pieces: List[str]):
        # Depth-first walk appending every text fragment found under `node` to `pieces`.
        if isinstance(node, NavigableString):
            pieces.append(str(node))
            return
        if not isinstance(node, Tag):
            return
        css_classes = node.get('class', [])
        if node.name == 'a' and 'calibre3' in css_classes:
            # The <a class="calibre3"> element contributes nothing; its siblings are still visited by the caller.
            return
        if node.name == 'span' and 'italic' in css_classes:
            # Italic spans are taken as one piece instead of being walked child by child.
            pieces.append(node.get_text())
            return
        for child in node.children:
            gather_text(child, pieces)

    chapter = None
    for position, paragraph in enumerate(soup.find_all('p')):
        # NOTE(review): the skip for class 'calibre14' (metadata) is disabled in the original, so those
        # paragraphs are cleaned and yielded like any other; they are only excluded from title detection.
        is_metadata = 'calibre14' in paragraph.get('class', [])
        if not is_metadata and position == 0 and try_chapter:
            # Instead of body text, this paragraph may carry the chapter number and title.
            bold_parts = [span.get_text() for span in paragraph.find_all('span', class_='bold')]
            if len(bold_parts) >= 2:
                chapter = ' '.join(bold_parts)
                continue

        pieces = []
        gather_text(paragraph, pieces)
        # Paragraphs containing a '| |' marker are dropped entirely.
        if '| |' in ' '.join(pieces):
            continue

        trimmed = (piece.strip() for piece in pieces)
        text = ' '.join(piece for piece in trimmed if piece)
        text = text.replace('\n', ' ').replace(u'\xa0', u' ')
        text = quote_re.sub(u'"', text)
        text = ellipsis_re.sub(u'...', text)
        text = bars_re.sub(u'', text)
        text = whitespace_re.sub(u' ', text).strip()

        # Skip paragraphs mentioning 'Oceano', empty ones, and bare numbers of up to three digits.
        if 'Oceano' in text or not text or (len(text) < 4 and text.isnumeric()):
            continue
        yield chapter, text
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Saved 615508 paragraphs to swords.parquet\n" | |
] | |
} | |
], | |
"source": [ | |
"from glob import glob\n", | |
"import pandas as pd\n", | |
"from IPython.display import clear_output\n", | |
"import IPython.utils\n", | |
"\n", | |
# Special per-book rules, keyed by the EPUB file name without its extension.
# Keys read by the processing loop:
#   'try_chapter' - passed to parse_ebook_html (default False)
#   'drop'        - chapter_ix values whose rows are removed
#   'clip'        - inclusive (first, last) paragraph_ix ranges whose rows are removed
#   'strip_nl'    - if True, newlines in the text column are replaced by spaces
# cryptonomicon requires try_chapter=True, and needs '| |' removed
special = {
    'a_clash_of_kings': {'try_chapter': True, 'drop': [1,2,76,75,74,73,72]},
    'a_court_of_frost_and_starlight' : {'drop': [1,2,3,34,35,36,37]},
    'a_court_of_mist_and_fury': {'try_chapter': True, 'drop': [1,2,3,81,80,79,78,77,76,75]},
    'a_court_of_silver_flames': {'try_chapter': True, 'drop': [1,2,3,4,90,89]},
    'a_court_of_thorns_and_roses': { 'drop': [1,2, 52,51,50,49]},
    'a_court_of_wings_and_ruin': {'try_chapter': True, 'drop': [1,2,3,4,5,94,93,92,91,90]},
    'a_crown_of_swords': {'try_chapter': True, 'drop': [1,2,3,4,5,52,51,50,49,48,47]},
    'a_dance_with_dragons': { 'drop': [1,2,3,4,5,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78]},
    'a_feast_for_crows': {'try_chapter': True, 'drop': [1,2,3,4,60,59,58,57,56,55,54,53,52,51,50]},
    'a_game_of_thrones': {'try_chapter': False, 'drop': [1,2,3,85,84,83,82,81,80,79,78,77,76], 'clip': [(6808, 6811)]},
    'a_memory_of_light': { 'drop': [], 'clip': [(11168, 11236)]},
    'a_storm_of_swords': {'try_chapter': True, 'drop': [1,2,93,92,91,90,89,88,87,86,85]},
    'a_time_of_blood': {'try_chapter': False, 'drop': [1,2,3,69,68,62,63,64,65,66,67]},
    'a_time_of_courage': {'try_chapter': False, 'drop': [1,2,3,4,133,132,131,130,129,128,127], 'clip': [(7539, 7539)]},
    'a_time_of_dread': {'try_chapter': False, 'drop': [1,2,3,4,5,61,60,59,58], 'clip': [(4942, 4943)]},
    'a_wizard_of_earthsea': {'try_chapter': False, 'drop': [1,2,3], 'clip': [(1041, 1041)]},
    'alice_in_wonderland': {'try_chapter': False, 'drop': [1,2,3]},
    'avempartha': {'try_chapter': True, 'drop': [1], 'clip': [(3494, 3533)]},
    'baptism_of_fire': {'try_chapter': False, 'drop': [1,2,3,4,14,15,16,17]},
    'blackflame': {'try_chapter': False, 'drop': [1,2,27,26], 'clip': [(3710, 3712)]},
    'blade_of_tyshalle': {'try_chapter': False, 'drop': [1,2,33,32], 'clip': [(9070, 9071)]},
    'blood_of_elves': {'try_chapter': False, 'drop': [1,2]},
    'blood_of_the_fold': {'try_chapter': False, 'drop': [56], 'clip': [(7072, 7073)]},
    'bloodline': {'try_chapter': False, 'drop': [1,2,26], 'clip': [(4559, 4644)]},
    'caine_black_knife': {'try_chapter': True, 'drop': [1,2,3,4,5,28,29,30], 'clip': [(5852, 5861)]},
    'chains_of_gaia': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(4162, 4165)]},
    'crossroads_of_twilight': {'try_chapter': False, 'drop': [], 'clip': [(2529, 2692)], 'strip_nl': True},
    'crown_conspiracy': {'try_chapter': False, 'drop': []},
    'death_masks': {'try_chapter': False, 'drop': []},
    'dragon_reborn': {'try_chapter': False, 'drop': [1,2,3,4,5,6,65], 'clip': [(4818, 4999)]},
    'dragons_of_autumn_twilight': {'try_chapter': False, 'drop': [1,2], 'strip_nl': True},
    'dragons_of_spring_dawning': {'try_chapter': False, 'drop': [1,2,3,4,42], 'clip': [(3552, 3556)]},
    'dragons_of_winter_night': {'try_chapter': False, 'drop': [1,2,3,4], 'clip': [(3865, 3869)]},
    'eye_of_the_world_the': {'try_chapter': False, 'drop': [1,2,3,4,5,61,60], 'clip': [(5955, 6211)]},
    'fellowship_of_the_ring': {'try_chapter': False, 'drop': [], 'clip': [(3810, 3811)]},
    'ghostwater': {'try_chapter': False, 'drop': [1,2,3,24], 'clip': [(3275, 3299)]},
    'good_omens': {'try_chapter': False, 'drop': [1,2,3,4], 'clip': [(4616, 4814)]},
    'graceling': {'try_chapter': True, 'drop': [1,2,3,4,5,6,7,50], 'clip': [(3647, 3650)]},
    'great_hunt': {'try_chapter': True, 'drop': [55,56,57,58,59,60], 'clip': [(5241, 5535)]},
    'heir_of_novron': {'try_chapter': False, 'drop': [1,2,3,4,5,59,60,61]},
    'heroes_die': {'try_chapter': True, 'drop': [1,2,3,4,5], 'clip': [(6451, 6451)]},
    'isle_of_winds': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(2990, 2993)]},
    'knife_of_dreams': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,49], 'clip': [(4436, 4521)]},
    'lady_of_avalon': {'try_chapter': False, 'drop': [1,2,3,4,5,6,39,40], 'clip': [(3812, 3812)]},
    'lion_the_witch_and_the_wardrobe': {'try_chapter': False, 'drop': [1]},
    'lord_of_chaos': {'try_chapter': True, 'drop': [1]},
    'malice': {'try_chapter': True, 'drop': [1,2,3,4,5,96], 'clip': [(7728, 7823)]},
    'mistborn_the_final_empire': {'try_chapter': True, 'drop': [1,2,3,4,43,44,45], 'clip': [(7687, 7726)]},
    'mists_of_avalon': {'try_chapter': True, 'drop': [1,2,3,4,5,74,75,76,77]},
    'nyphron_rising': {'try_chapter': False, 'drop': [1,2], 'clip': [(3473, 3482)], 'strip_nl': True},
    'oathbringer': {'try_chapter': True, 'drop': [1,2,3,4,5,6]},
    'prince_caspian': {'try_chapter': False, 'drop': []},
    'ravens_of_avalon': {'try_chapter': False, 'drop': [1,6]},
    'reaper': {'try_chapter': False, 'drop': [1,2,31], 'clip': [(5290, 5315)]},
    'red_seas_under_red_skies': {'try_chapter': True, 'drop': [1,2,3,26,27,28,29,30,31], 'clip': [(7146, 7361)]},
    'return_of_the_king': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,10,119,120,121,122,123,124], 'clip': [(4895, 5083)]},
    'rise_of_empire': {'try_chapter': False, 'drop': [1,2,3,4,5,6,51,52,53]},
    'ruin_us': {'try_chapter': False, 'drop': [1,2,3,4,5,6,102,103,104,105,106,107]},
    'skysworn': {'try_chapter': False, 'drop': [1,2,3,4,5,22], 'clip': [(2875, 2903)]},
    'soulsmith': {'try_chapter': True, 'drop': [1,2,3,4,25,26], 'clip': [(2573, 2574)]},
    'stone_of_tears': {'try_chapter': False, 'drop': [1]},
    'storm_front': {'try_chapter': True, 'drop': [1,2], 'clip': [(2208, 2234)]},
    'summer_knight': {'try_chapter': False, 'drop': []},
    'sword_of_destiny': {'try_chapter': False, 'drop': [1]},
    'test_of_the_twins': {'try_chapter': False, 'drop': [1,2,3,4,5,44,45], 'clip': [(2673, 2677)], 'strip_nl': True},
    'the_chainfire': {'try_chapter': False, 'drop': [1]},
    'the_confessor': {'try_chapter': False, 'drop': [1], 'clip': [(6800, 6800)]},
    'the_crown_tower': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,32,33,34,31]},
    'the_drowned_tomb': {'try_chapter': True, 'drop': [1,2,3,4], 'clip': [(3342, 3345)]},
    'the_emerald_storm': {'try_chapter': False, 'drop': [1]},
    'the_exercise_of_vital_powers': {'try_chapter': False, 'drop': [1,2,3,4,5,6,30,31], 'clip': [(3785, 3788)]},
    'the_faith_of_the_fallen': {'try_chapter': False, 'drop': [1]},
    'the_fires_of_heaven': {'try_chapter': True, 'drop': [1]},
    'the_forest_house': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,39,40]},
    'the_gathering_storm': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,64,65], 'clip': [(7915, 8006)]},
    'the_guns_of_empire': {'try_chapter': False, 'drop': [1,2,3,4,5]},
    'the_hero_of_ages': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,99,100,101]},
    'the_hobbit_or_there_and_back_again': {'try_chapter': False, 'drop': [1,24,25,26,27,28], 'clip': [(1845, 1970)], 'strip_nl': True},
    'the_horse_and_his_boy': {'try_chapter': False, 'drop': [1,2,23,24], 'clip': [(940, 964)]},
    'the_infernal_battalion': {'try_chapter': False, 'drop': [1,2,3,45,46,47,48], 'clip': [(6135, 6166)]},
    'the_lady_of_the_lake': {'try_chapter': False, 'drop': [1,2]},
    'the_last_battle': {'try_chapter': False, 'drop': [1,2,19,20,21,22,23], 'clip': [(940, 964)]},
    'the_last_wish': {'try_chapter': False, 'drop': [1]},
    'the_lies_of_locke_lamora': {'try_chapter': False, 'drop': [1], 'clip': [(4971, 4984)]},
    'the_mists_of_avalon': {'try_chapter': False, 'drop': [1], 'clip': [(7834, 7835)]},
    'the_naked_empire': {'try_chapter': False, 'drop': [1]},
    'the_path_of_daggers': {'try_chapter': True, 'drop': [1,2,3,4,5,6,39,40], 'clip': [(2867, 2907)]},
    'the_phantom': {'try_chapter': False, 'drop': [1,2], 'clip': [(5954, 5955)]},
    'the_price_of_valor': {'try_chapter': True, 'drop': [1,2,3,4,40], 'clip': [(6195, 6237)]},
    'the_princess_bride': {'try_chapter': False, 'drop': [1,2,3]},
    'the_republic_of_thieves': {'try_chapter': False, 'drop': [1,2,175,176,177], 'clip': [(7812, 7812)]},
    'the_rose_and_the_thorn': {'try_chapter': False, 'drop': [1,27,28,29], 'clip': [(3373, 3462)]},
    'the_shadow_of_the_gods': {'try_chapter': False, 'drop': [1,2,57,58,59], 'clip': [(5088, 5095)]},
    'the_shadow_rising': {'try_chapter': False, 'drop': [1,2], 'clip': [(6737, 6755)]},
    'the_shadow_throne': {'try_chapter': True, 'drop': [1,2,3,4,5]},
    'the_silver_chair': {'try_chapter': True, 'drop': [1]},
    'the_slow_regard_of_silent_things': {'try_chapter': True, 'drop': [1,2,3,4,5], 'clip': [(748, 805)]},
    'the_soul_of_the_fire': {'try_chapter': False, 'drop': [1], 'clip': [(8410, 8412)]},
    'the_temple_of_the_winds': {'try_chapter': False, 'drop': [1]},
    'the_the_omen_machine': {'try_chapter': False, 'drop': [1,88,89], 'clip': [(4469, 4516)]},
    'the_the_pillars_of_creation': {'try_chapter': True, 'drop': [1,2], 'clip': [(6146, 6204)]},
    'the_thousand_names': {'try_chapter': True, 'drop': [1,2,3,4,5,6]},
    'the_tower_of_swallows': {'try_chapter': False, 'drop': [1,24,25,26,27,28], 'clip': [(4355, 4497)]},
    'the_two_towers': {'try_chapter': True, 'drop': [1,2,3,4,5,29,30,31,32,33], 'clip': [(2787, 2834)]},
    'the_voyage_of_the_dawn_treader': {'try_chapter': False, 'drop': [1,2,3,20,21,22,23,24], 'clip': [(1115, 1139)]},
    'the_way_of_kings': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,97,98,99], 'clip': [(12698, 12843)]},
    'the_well_of_ascension': {'try_chapter': False, 'drop': [1,2,3,4,5,6,7,8,9,10,11,141,142,143,144,145,146], 'clip': [(9335, 9343)]},
    'theft_of_swords': {'try_chapter': False, 'drop': [1,2,3,4,5,6,35]},
    'time_of_the_twins': {'try_chapter': True, 'drop': [1,2,3], 'clip': [(3578, 3579)]},
    'towers_of_midnight': {'try_chapter': True, 'drop': [1,2,57], 'clip': [(10750, 10835)]},
    'underlord': {'try_chapter': False, 'drop': [1,2,3,27], 'clip': [(3636, 3703)]},
    'unsouled': {'try_chapter': True, 'drop': [1,2,3,4,25,26], 'clip': [(2492, 2493)]},
    'valour': {'try_chapter': False, 'drop': [1,2,3,4,5,126,127], 'clip': [(7497, 7510)]},
    'war_of_the_twins': {'try_chapter': True, 'drop': [1]},
    'wintersteel': {'try_chapter': False, 'drop': [1,2,3,32], 'clip': [(6603, 6603)]},
    'wintertide': {'try_chapter': True, 'drop': [1,2,3,4,5,75,76,77], 'clip': [(3092, 3112)]},
    'words_of_radiance': {'try_chapter': False, 'drop': [1,2,3,4,110,111,112,107,108,109], 'clip': [(15840, 15860)]},
    'wrath': {'try_chapter': False, 'drop': [1,2,3,4,5,133,134,135], 'clip': [(8460, 8469)]},
}
"\n", | |
# Column layout of the tuples yielded by parse_ebook_html.
COLUMNS = ['book_name', 'paragraph_ix', 'chapter_ix', 'chapter_title', 'text', 'char_count', 'cumsum_char_count']
# How many trailing paragraphs to print per book when eyeballing where the back matter starts.
PREVIEW_ROWS = 200

book_frames = []
for book in sorted(glob('source/*.epub')):
    name = os.path.splitext(os.path.basename(book))[0]
    print(f"Processing {name}")

    # Books without an entry in `special` are processed with the defaults.
    config = special.get(name, {})
    try_chapter = config.get('try_chapter', False)
    clips = config.get('clip', [])
    drops = config.get('drop', [])
    strip_nl = config.get('strip_nl', False)

    lines = parse_ebook_html(book, try_chapter=try_chapter)
    new_frame = pd.DataFrame(lines, columns=COLUMNS)

    # Remove whole chapters (front/back matter) by chapter_ix.
    new_frame = new_frame[~new_frame['chapter_ix'].isin(drops)]
    for clip in clips:
        # we want to remove the paragraph id's that are in the clip range, inclusive
        print(f"Clipping {clip}")
        first, last = clip
        new_frame = new_frame[~new_frame['paragraph_ix'].between(first, last)]

    # paragraph_ix and cumsum_char_count keep their pre-filter values; only the row index is renumbered.
    new_frame = new_frame.reset_index(drop=True)

    if strip_nl:
        # Replace newlines, then collapse the doubled whitespace that leaves behind.
        # (The original chained Series.replace, which matches whole values and so did nothing.)
        cleaned = new_frame['text'].str.replace('\n', ' ', regex=False).str.replace(r'\s\s+', ' ', regex=True)
        new_frame = new_frame.assign(text=cleaned)

    # Print the tail of the book, last paragraph first; tail() is safe for books shorter than PREVIEW_ROWS.
    for row in new_frame.tail(PREVIEW_ROWS).iloc[::-1].itertuples(index=False):
        print(row.paragraph_ix, row.text)
    print(end='', flush=True)
    # Uncomment to pause and read the preview before it is cleared.
    #input('okay')

    clear_output()

    book_frames.append(new_frame)

# Concatenate once at the end instead of growing a frame inside the loop.
if book_frames:
    all_books = pd.concat(book_frames, ignore_index=True)
else:
    all_books = pd.DataFrame([], columns=COLUMNS)
all_books.to_parquet('swords.parquet')
print(f"Saved {len(all_books)} paragraphs to swords.parquet")
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import pandas as pd\n", | |
all_books = pd.read_parquet('swords.parquet')

# Number of chapters inspected at each end of a book.
EDGE_CHAPTERS = 6

# For every book whose EPUB is still present, print the first paragraph of its first and last
# few chapters so leftover front/back matter can be spotted and added to the drop lists.
for book in all_books['book_name'].unique():
    if not os.path.exists(f'source/{book}.epub'):
        continue
    bookd = all_books[all_books['book_name'] == book]
    min_chapter = bookd['chapter_ix'].min()
    max_chapter = bookd['chapter_ix'].max()
    print(book, min_chapter, max_chapter)

    # First paragraph text of each chapter, indexed by chapter_ix.
    openers = bookd.drop_duplicates('chapter_ix').set_index('chapter_ix')['text']

    leading = [min_chapter + offset for offset in range(EDGE_CHAPTERS)]
    # Count down to offset 0 so the final chapter itself is included
    # (the original range(6, 0, -1) stopped one short of it).
    trailing = [max_chapter - offset for offset in range(EDGE_CHAPTERS - 1, -1, -1)]
    for chapter_ix in leading + trailing:
        if chapter_ix in openers.index:
            print(book, chapter_ix, openers[chapter_ix])
        else:
            # Chapter numbers removed by a drop rule show up as gaps.
            print(book, chapter_ix, 'missing')
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Row labels at which a new chapter begins: the first row overall, plus every row whose
# chapter_ix or book_name differs from the row before it. Comparing book_name as well
# catches the boundary between two books whose adjacent chapters share the same number.
chapter_changed = all_books['chapter_ix'] != all_books['chapter_ix'].shift()
book_changed = all_books['book_name'] != all_books['book_name'].shift()
tops = all_books.index[chapter_changed | book_changed].tolist()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Lift pandas' column-width and row-count display limits, then show the rows selected by `tops`.
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)
all_books.iloc[tops]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from glob import glob\n", | |
"import os\n", | |
"import re\n", | |
"\n", | |
import shutil

# Longest slug kept for a file name (before the '.epub' suffix).
MAX_SLUG_LENGTH = 64


def slugify_title(stem):
    """
    Turn an incoming file stem into the slug used for files in ./source.

    The second ' - '-separated segment is used (as in 'Author - Title'); if the stem contains no
    ' - ', the whole stem is used. Characters other than ASCII letters, digits and
    spaces are removed, whitespace runs become '_', and the result is lower-cased
    and truncated to MAX_SLUG_LENGTH characters.

    Parameters:
    - stem (str): File name without directory or extension.

    Returns:
    - str: The slug; may be empty if nothing survives the filtering.
    """
    segments = stem.split(' - ')
    title = segments[1] if len(segments) > 1 else stem
    title = re.sub(r'[^a-zA-Z0-9 ]', '', title)
    title = re.sub(r'\s+', '_', title)
    return title.lower()[:MAX_SLUG_LENGTH]


# Copy every EPUB from ./incoming into ./source under its slugified name.
os.makedirs('./source', exist_ok=True)
for filename in glob('./incoming/*.epub'):
    basename = os.path.basename(filename)
    noext = slugify_title(os.path.splitext(basename)[0])
    if not noext:
        print("Error: %s : %s" % (filename, 'no usable characters in title'))
        continue
    try:
        # shutil.copy instead of a shell `cp`: file names with quotes or other shell
        # metacharacters are handled safely, and failures raise OSError.
        shutil.copy(filename, f'./source/{noext}.epub')
    except OSError as e:
        print("Error: %s : %s" % (filename, e.strerror))
    else:
        print(noext, basename)
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "jupyterenv", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment