Skip to content

Instantly share code, notes, and snippets.

@xeb
Created August 31, 2020 06:08
Show Gist options
  • Save xeb/c7f6b61a96b1268f46920a4d95a49dee to your computer and use it in GitHub Desktop.
Save xeb/c7f6b61a96b1268f46920a4d95a49dee to your computer and use it in GitHub Desktop.
Process all Word (.docx) documents under a directory into a single text file (for GPT training)
#!/usr/bin/env python
import os
import docx2txt
from absl import app
from absl import flags
# Command-line flags, parsed by absl when app.run() starts the program.
# --output has a default value; --source has none and is marked required.
FLAGS = flags.FLAGS
flags.DEFINE_string(
    name="output",
    default="output.txt",
    help="The path of final output.",
)
flags.DEFINE_string(
    name="source",
    default=None,
    help="The source path to process.",
)
flags.mark_flag_as_required("source")
def flush(text):
    """Append ``text`` to the file named by the ``--output`` flag.

    Every ``"\\n\\n"`` pair in ``text`` is replaced by a single newline in one
    non-overlapping pass, so a run of three or more newlines is shortened
    but not fully collapsed. The file is opened in append mode, so repeated
    calls (and repeated runs of the script) accumulate in the same file.

    Args:
        text: The extracted document text to append.
    """
    text = text.replace("\n\n", "\n")
    # Terminate each document with a newline so the next appended document
    # does not start on the same line as the end of this one.
    if text and not text.endswith("\n"):
        text += "\n"
    # Explicit UTF-8: without it the platform's locale encoding is used, which
    # can raise UnicodeEncodeError on characters it cannot represent.
    with open(FLAGS.output, "a", encoding="utf-8") as f:
        f.write(text)
def main(argv):
    """Walk ``--source`` and append the text of every .docx file to ``--output``.

    Each matching file is converted with ``docx2txt.process``, written out via
    ``flush``, and its path is printed once it has been processed.

    Args:
        argv: Positional command-line arguments supplied by ``app.run``;
            unused.
    """
    del argv  # Unused.
    for root, dirs, files in os.walk(FLAGS.source):
        # Sorting in place makes the top-down walk descend into
        # sub-directories in name order, so the output is reproducible.
        dirs.sort()
        for name in sorted(files):
            # "~$<name>.docx" files are the lock/owner files Word leaves next
            # to an open document; they are not real documents.
            if name.startswith("~$"):
                continue
            # Match the extension case-insensitively so ".DOCX" is included.
            if not name.lower().endswith(".docx"):
                continue
            full_path = os.path.join(root, name)
            text = docx2txt.process(full_path)
            flush(text)
            print(f"Processed {full_path}")
# Script entry point: hand control to absl, which parses the flags and then
# calls main().
if __name__ == "__main__":
    app.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment