Created
December 11, 2020 10:37
-
-
Save awan1/714f9d166bde8e2aadc4c2cb28f4c8da to your computer and use it in GitHub Desktop.
Python code for converting Daystar Eld's "Pokemon: The Origin of Species" to Markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# You'll need to `pip install html2text`
import urllib.request

import html2text
def get_text_from_url(url):
    """Download *url* and return its HTML converted to Markdown text.

    The response body is decoded as UTF-8 and handed to
    ``html2text.html2text``.

    Args:
        url: Address to fetch with ``urllib.request.urlopen``.

    Returns:
        The result of ``html2text.html2text`` for the downloaded page.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager closes the response even if read() raises,
    # which a trailing fp.close() call would not.
    with urllib.request.urlopen(url) as fp:
        url_bytes = fp.read()
    url_str = url_bytes.decode("utf8")
    return html2text.html2text(url_str)
# These strings mark the start and end of the chapter, as formatted on daystareld.com
chapter_start_str = "# Chapter"
chapter_end_str = "[](https://mistape.com)"
# First and last chapter numbers to process; both ends are inclusive.
start_chapter = 1
end_chapter = 87
def get_chapter(i):
    """Return the Markdown text of chapter *i*, trimmed to the chapter body.

    The page ``http://daystareld.com/pokemon-<i>/`` is fetched through
    ``get_text_from_url`` and cut down to the span that begins at
    ``chapter_start_str`` and stops just before ``chapter_end_str``.

    Args:
        i: Chapter number substituted into the page URL.

    Returns:
        The text between the two markers. If the start marker is absent the
        result begins at the start of the page text; if the end marker is
        absent the result runs to the end of the page text.
    """
    txt = get_text_from_url("http://daystareld.com/pokemon-{}/".format(i))

    # str.find returns -1 when the marker is missing; using that directly as
    # a slice bound would silently yield the wrong span.
    start = txt.find(chapter_start_str)
    if start == -1:
        start = 0

    # Only look for the end marker after the chapter start, so an earlier
    # occurrence on the page cannot produce an empty slice.
    end = txt.find(chapter_end_str, start)
    if end == -1:
        end = len(txt)

    return txt[start:end]
# If you want to write one chapter per file
def write_all_chapters_to_files(first=None, last=None):
    """Fetch each chapter and save it to ``pokemon-toos-ch<N>.md``.

    Files are written to the current working directory, one per chapter,
    and a progress line is printed before each download.

    Args:
        first: First chapter number to fetch. Defaults to ``start_chapter``.
        last: Last chapter number to fetch (inclusive). Defaults to
            ``end_chapter``.
    """
    first = start_chapter if first is None else first
    last = end_chapter if last is None else last

    for i in range(first, last + 1):
        print("Getting chapter {}".format(i))
        txt = get_chapter(i)
        # Explicit UTF-8 so non-ASCII characters in the chapter text do not
        # depend on the platform's default encoding.
        with open("pokemon-toos-ch{}.md".format(i), "w", encoding="utf-8") as f:
            f.write(txt)
# To combine all chapters into one
def combine_saved_chapters(first=None, last=None):
    """Concatenate saved chapter files into one Markdown file.

    Reads ``pokemon-toos-ch<N>.md`` for every chapter in the range from the
    current working directory and writes the result to
    ``pokemon-toos-ch<first>-<last>.md``. Each chapter, including the last
    one, is followed by two newlines.

    Args:
        first: First chapter number to include. Defaults to ``start_chapter``.
        last: Last chapter number to include (inclusive). Defaults to
            ``end_chapter``.

    Raises:
        FileNotFoundError: If a chapter file in the range does not exist.
    """
    first = start_chapter if first is None else first
    last = end_chapter if last is None else last

    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for i in range(first, last + 1):
        # Explicit UTF-8 so reading does not depend on the platform default.
        with open("pokemon-toos-ch{}.md".format(i), "r", encoding="utf-8") as f:
            parts.append(f.read())
        # Add newlines between chapters
        parts.append("\n\n")

    with open("pokemon-toos-ch{}-{}.md".format(first, last), "w", encoding="utf-8") as f:
        f.write("".join(parts))
# Now you have a complete markdown file in `pokemon-toos-ch1-87.md`.
# This can be converted into PDF or EPUB as you like.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment