Skip to content

Instantly share code, notes, and snippets.

@martinpi
Created January 17, 2022 16:17
Show Gist options
  • Save martinpi/52fe006e33adc06ab233b4d300dcb2ec to your computer and use it in GitHub Desktop.
Save martinpi/52fe006e33adc06ab233b4d300dcb2ec to your computer and use it in GitHub Desktop.
A cleaning script for Gutenberg texts. Cleans a whole folder full of text files. Pretty rough around the edges but seems to work.
from gutenberg_cleaner import *
import getopt
import sys
import os
from pathlib import Path
# Command-line script: run gutenberg_cleaner's super_cleaner over every ".txt"
# file in a directory and concatenate the cleaned text into one output file.
#
#   -h, --help          show usage and exit
#   -i, --input DIR     directory to read ".txt" files from (default ".")
#   -o, --output FILE   file to write the combined result to (default "output.txt")
#   -s, --strip         join consecutive lines with a space; blank lines become
#                       a single newline
argumentList = sys.argv[1:]
options = "hi:o:s"
# Long options that take a value need a trailing "=", mirroring the ":" in the
# short option string. Without it getopt treats --input/--output as plain
# flags, so the supplied value is dropped ("--input DIR") or rejected
# ("--input=DIR").
long_options = ["help", "input=", "output=", "strip"]
input_dir = "."
output_filename = "output.txt"
final_content = ""
strip = False

try:
    # Parsing argument
    arguments, values = getopt.getopt(argumentList, options, long_options)
except getopt.error as err:
    # output error, and return with an error code instead of silently
    # falling through to process the default directory
    print(str(err))
    sys.exit(2)

# checking each argument
for currentArgument, currentValue in arguments:
    if currentArgument in ("-h", "--help"):
        print("Displaying Help")
        print(
            "Usage: " + sys.argv[0]
            + " [-h] [-i|--input DIR] [-o|--output FILE] [-s|--strip]"
        )
        # Asking for help must not start a cleaning run.
        sys.exit(0)
    elif currentArgument in ("-i", "--input"):
        print("Reading from:", currentValue)
        input_dir = currentValue
    elif currentArgument in ("-o", "--output"):
        print("Writing to: ", currentValue)
        output_filename = currentValue
    elif currentArgument in ("-s", "--strip"):
        print("Stripping newlines")
        strip = True

entries = Path(input_dir)
# Resolved once so a previous run's output sitting in the input directory
# (the defaults put both in ".") is not fed back in as a source text.
output_path = Path(output_filename).resolve()
# Collect fragments and join once at the end; repeated "+=" on a growing
# string is quadratic for a folder full of books.
pieces = []

# iterdir() yields entries in arbitrary order; sort so the combined output is
# reproducible between runs.
for entry in sorted(entries.iterdir()):
    if not entry.name.endswith(".txt"):
        continue
    if entry.resolve() == output_path:
        continue
    print("Processing file: ", entry.name)
    # NOTE(review): the file is read with the platform default encoding, as
    # before; confirm whether an explicit encoding is wanted for these texts.
    with entry.open('r') as input_file:
        # super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
        cleaned_content = super_cleaner(input_file.read())
    for line in cleaned_content.splitlines():
        # Lines beginning with '[deleted]' are left out of the output.
        if line.startswith('[deleted]'):
            continue
        if strip:
            stripped = line.rstrip()
            # A blank line becomes a newline; any other line is followed by
            # a space so consecutive lines run together.
            pieces.append(stripped + " " if stripped else "\n")
        else:
            pieces.append(line + "\n")

final_content = "".join(pieces)
with open(output_filename, "w") as output_file:
    output_file.write(final_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment