Skip to content

Instantly share code, notes, and snippets.

@swablueme
Last active March 28, 2021 05:35
Show Gist options
  • Save swablueme/6cb5345f95de9282fd9a8e4ba908ef83 to your computer and use it in GitHub Desktop.
Save swablueme/6cb5345f95de9282fd9a8e4ba908ef83 to your computer and use it in GitHub Desktop.
.txt files to epub
# -*- coding: utf-8 -*-
import os
import io
import pypub
import re
import logging
import time
# Module-wide logger: the root logger (no name passed), with its level set to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Path that del_file() removes before a new epub is built.
# NOTE(review): presumably matches the file pypub writes for the 'new epub' title -- confirm.
FILENAME="new epub.epub"
class retrywrapper:
    """Namespace for a retry decorator (for flaky steps such as deleting files)."""

    @staticmethod
    def retry(func=None, *, tries=5, delay=1):
        """Retry the decorated callable, doubling the pause after each failure.

        Works bare (``@retrywrapper.retry``) or configured
        (``@retrywrapper.retry(tries=3, delay=0.5)``).

        Args:
            func: The callable to wrap. Left as ``None`` when the decorator
                is configured through keyword arguments.
            tries: Maximum number of attempts.
            delay: Seconds to sleep after the first failure; the pause is
                doubled after every further failure.

        Returns:
            The wrapped callable. Calling it returns whatever ``func``
            returns, or ``None`` when every attempt raised; failures are
            logged rather than re-raised.
        """
        # Imported here so the block does not rely on the file's import list.
        import functools

        def decorate(target):
            label = getattr(target, "__name__", repr(target))

            # wraps() keeps the name and docstring of the decorated function.
            @functools.wraps(target)
            def funcrun(*args, **kwargs):
                wait = delay
                for attempt in range(1, tries + 1):
                    try:
                        return target(*args, **kwargs)
                    except Exception as e:
                        logger.error(str(e), exc_info=True)
                        # The arguments are logged instead of being passed to
                        # print(*args, **kwargs), which raised TypeError for
                        # any keyword that print() does not accept.
                        logger.error(
                            "attempt %d of %d failed for %s (args=%r, kwargs=%r)",
                            attempt, tries, label, args, kwargs,
                        )
                        # Sleeping after the last attempt would only waste time.
                        if attempt < tries:
                            time.sleep(wait)
                            wait = wait * 2
                logger.error("giving up on %s after %d attempts", label, tries)
                return None

            return funcrun

        # Bare use passes the function directly; configured use passes nothing.
        return decorate if func is None else decorate(func)
@retrywrapper.retry
def del_file():
    """Remove the epub at FILENAME if it is present; do nothing otherwise."""
    stale_epub = FILENAME
    # nothing to clean up when the file is not there
    if not os.path.exists(stale_epub):
        return
    os.remove(stale_epub)
#singlelined is a tuple of filenames that should be treated
#as if they have no paragraph breaks
def open_file(singlelined=()):
    """Turn the .txt files of the working directory into one epub.

    singlelined holds the filenames that are parsed in single-line mode:
    text without paragraph breaks, where punctuation at the end of a line
    is what marks the end of a paragraph.
    """
    #get rid of the epub from an earlier run before building a new one
    del_file()
    book = pypub.Epub('new epub')
    #every .txt file other than patterns.txt is a chapter of the book
    chapter_files = []
    for name in os.listdir(os.getcwd()):
        if name.endswith(".txt") and name != "patterns.txt":
            chapter_files.append(name)
    #chapters go in by the number found in their filename
    chapter_files.sort(key=extract_num)
    for name in chapter_files:
        #clean the text first, then add the cleaned text as a chapter
        parse_file(name, singleline=name in singlelined)
        create_epub_ch(book, name)
    target_dir = os.getcwd()
    book.create_epub(target_dir)
def extract_num(text):
    """Return the first run of digits found in ``text`` as an int.

    The result is used as the chapter number: it orders the chapters and
    becomes the chapter title in the table of contents.

    Args:
        text: A filename such as ``"chapter 12.txt"``.

    Returns:
        The first number in ``text``, e.g. ``12``.

    Raises:
        ValueError: If ``text`` contains no digit at all.
    """
    # Raw string: a plain '\d' is an invalid escape sequence in a str literal.
    found = re.search(r'\d+', text)
    if found is None:
        raise ValueError("no chapter number found in %r" % (text,))
    return int(found.group())
def parse_file(file, singleline=False):
    """Clean one chapter .txt file and save it as output/<file>_fixed.txt.

    Consecutive non-empty lines are joined with spaces into one paragraph,
    and every paragraph is followed by two newlines. The joined text is run
    through clean_text() before it is written.

    Args:
        file: Name of the .txt file to read (UTF-8).
        singleline: When true, a line that ends in ``.``, ``!``, ``?``,
            ``"`` or a right double quotation mark also closes the current
            paragraph. Meant for text that has no blank lines between
            paragraphs.
    """
    # Plain suffix test: the former character class [\.|!|?|\"|\u201d] also
    # contained a literal "|", so a line ending in "|" closed a paragraph.
    paragraph_endings = (".", "!", "?", '"', "\u201d")
    paragraphs = []
    collector = []
    with open(file, "r", encoding='utf-8') as f:
        for raw_line in f:
            # drop zero-width non-joiners, then trailing whitespace/newline
            line = raw_line.replace("\u200c", "").rstrip()
            if not line:
                # a blank line closes the paragraph collected so far
                paragraphs.append(" ".join(collector))
                collector = []
                continue
            collector.append(line)
            if singleline and line.endswith(paragraph_endings):
                paragraphs.append(" ".join(collector))
                collector = []
    if collector:
        # text at the very end of the file has no blank line after it
        paragraphs.append(" ".join(collector))
    cleaned_text = clean_text("".join(part + "\n\n" for part in paragraphs))
    # The output is written last, so a failure while reading or cleaning
    # does not leave an empty *_fixed.txt behind.
    os.makedirs("output", exist_ok=True)
    with open(os.path.join("output", file + "_fixed.txt"), "w", encoding='utf-8') as g:
        g.write(cleaned_text)
def create_epub_ch(epub, file):
    """Add the cleaned text of ``file`` to ``epub`` as one chapter.

    Reads output/<file>_fixed.txt, wraps every non-empty line in a <p>
    element and titles the chapter with the number found in the filename.
    """
    #the chapter is handed to pypub as html, so the text goes into a small page
    page_template = """<html>
<head></head>
<body>%s</body>
</html>"""
    cleaned_path = os.path.join("output", file + "_fixed.txt")
    with open(cleaned_path, "r", encoding='utf-8') as f:
        contents = f.read()
    #empty lines are skipped, every other line becomes a paragraph
    paragraphs = ["<p>" + row + "</p>" for row in contents.split('\n') if row]
    page = page_template % "".join(paragraphs)
    chapter_title = str(extract_num(file))
    chapter = pypub.create_chapter_from_string(page, url=None, title=chapter_title)
    epub.add_chapter(chapter)
def clean_text(text, patterns_file="patterns.txt"):
    """Apply the regex rules from ``patterns_file`` to ``text``, in order.

    The rules file is read in groups of three lines:
      1. the regex pattern (compiled with re.DOTALL)
      2. the substitution text
      3. a blank line, or ``Capture`` to replace the whole text with group 1
         of the pattern's first match (line 2 is not used for such a rule)

    Trailing lines that do not fill a complete group of three are ignored.

    Args:
        text: The text to clean.
        patterns_file: Path of the rules file (UTF-8).

    Returns:
        The text after every rule has been applied.

    Raises:
        ValueError: If a ``Capture`` pattern does not match the text.
    """
    # "with" closes the rules file; it used to be left open.
    with open(patterns_file, "r", encoding='utf-8') as rules_file:
        patterns = rules_file.read().split('\n')
    # step through the complete three-line groups only
    for start in range(0, len(patterns) - 2, 3):
        raw_pattern, replacement, mode = patterns[start:start + 3]
        pattern = re.compile(raw_pattern, re.DOTALL)
        if mode == "Capture":
            found = pattern.search(text)
            if found is None:
                raise ValueError(
                    "Capture pattern %r did not match the text" % (raw_pattern,)
                )
            # Continue with the captured group rather than returning, so the
            # rules listed after a Capture rule are still applied.
            text = found.group(1)
        else:
            text = pattern.sub(replacement, text)
    return text
if __name__ == "__main__":
    #no file is listed as single-lined when run as a script
    open_file(singlelined=())
@swablueme
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment