Skip to content

Instantly share code, notes, and snippets.

@todd-cook
Created December 4, 2017 19:52
Show Gist options
  • Save todd-cook/8cfb1df8332b14113b877325de4bb45b to your computer and use it in GitHub Desktop.
Save todd-cook/8cfb1df8332b14113b877325de4bb45b to your computer and use it in GitHub Desktop.
deboilerplatify
import argparse
from textblob import TextBlob
def deboilerplatify(blob):
    """Strip Project Gutenberg header and footer boilerplate from a text.

    Everything up to and including the first occurrence of
    "Produced by Anonymous Volunteers" is removed, and everything from the
    last occurrence of "End of the Project Gutenberg EBook" onward is
    removed. After each cut the remaining text is stripped of surrounding
    whitespace. A marker that does not occur leaves that end of the text
    untouched.

    Args:
        blob: A ``str`` or string-like object supporting ``find``,
            ``rfind``, slicing and ``strip`` (the script passes a
            ``TextBlob``).

    Returns:
        The text between the two markers, of the same type that slicing
        and ``strip`` produce for ``blob``.
    """
    end_of_top_boilerplate = "Produced by Anonymous Volunteers"
    pos = blob.find(end_of_top_boilerplate)
    # find()/rfind() return -1 on a miss and 0 for a match at the very
    # start, so the index must be compared with -1 rather than tested for
    # truthiness (which would cut on a miss and skip a match at index 0).
    if pos != -1:
        blob = blob[pos + len(end_of_top_boilerplate):].strip()

    start_of_end_boilerplate = "End of the Project Gutenberg EBook"
    pos = blob.rfind(start_of_end_boilerplate)
    if pos != -1:
        blob = blob[:pos].strip()
    return blob
# Script entry point: read a Project Gutenberg text file, strip its
# boilerplate, and print the first and last 100 characters of what remains.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="TODO",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-i', '--infile', default="1342-0.txt",
        help="Input filename")
    args = parser.parse_args()
    print("open")
    # Project Gutenberg "-0.txt" files are UTF-8; decode explicitly instead
    # of relying on the platform's default encoding.
    with open(args.infile, encoding="utf-8") as f:
        blob = TextBlob(f.read())
    blob = deboilerplatify(blob)
    print("start:", blob[:100])
    # A negative slice start yields the final 100 characters (or the whole
    # text when it is shorter than that).
    print("end:", blob[-100:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment