Skip to content

Instantly share code, notes, and snippets.

@TerrorJack
Last active May 20, 2023 01:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TerrorJack/a7eeb210eb45a223de29a0cb7c9475f6 to your computer and use it in GitHub Desktop.
Save TerrorJack/a7eeb210eb45a223de29a0cb7c9475f6 to your computer and use it in GitHub Desktop.
# Development shell exposing a Python 3 interpreter that has the `stanza`
# package available. `pkgs` may be supplied by the caller; otherwise it is
# imported from the flake referenced as "nixpkgs".
{ pkgs ? import (builtins.getFlake "nixpkgs") {} }:
let
  # Python 3 environment bundled with the one library the script needs.
  pythonWithStanza = pkgs.python3.withPackages (ps: [ ps.stanza ]);
in
pkgs.mkShell {
  nativeBuildInputs = [ pythonWithStanza ];
}
#!/usr/bin/env python3
import stanza
import os
# Lazily-built pipeline shared by every call to sentences(); see _tokenizer().
_TOKENIZE_PIPELINE = None


def _tokenizer():
    """Return the shared English tokenize-only pipeline, building it on first use."""
    global _TOKENIZE_PIPELINE
    if _TOKENIZE_PIPELINE is None:
        _TOKENIZE_PIPELINE = stanza.Pipeline(lang='en', processors='tokenize')
    return _TOKENIZE_PIPELINE


def sentences(text):
    """Split ``text`` into sentences of token strings.

    The text is run through an English stanza pipeline that only has the
    ``tokenize`` processor enabled.  The pipeline is constructed once and
    reused, so processing many files does not rebuild it for each one.

    Args:
        text: The raw text to segment.

    Returns:
        A list with one entry per detected sentence; each entry is the list
        of that sentence's token texts, in order.
    """
    doc = _tokenizer()(text)
    return [[token.text for token in sentence.tokens] for sentence in doc.sentences]
def reformat(doc):
    """Join tokenized sentences back into text, one sentence per paragraph.

    Within a sentence, a single space is inserted before every token (after
    the first) whose first character is a letter or a digit; any other token,
    such as punctuation, is attached directly to the preceding text.  Every
    sentence is terminated by a blank line (``'\\n\\n'``).

    Args:
        doc: An iterable of sentences, each a sequence of token strings.

    Returns:
        The reassembled text.  Empty sentences contribute nothing, and an
        empty ``doc`` yields ``''``.
    """
    paragraphs = []
    for sentence in doc:
        if not sentence:
            # Nothing to emit; also avoids indexing into an empty sentence.
            continue
        pieces = [sentence[0]]
        for token in sentence[1:]:
            # token[:1] is '' for an empty token, so this never raises.
            # isalnum (rather than isalpha) keeps numbers separated from the
            # word before them: "have 3", not "have3".
            if token[:1].isalnum():
                pieces.append(' ')
            pieces.append(token)
        pieces.append('\n\n')
        paragraphs.append(''.join(pieces))
    return ''.join(paragraphs)
def reformat_inplace(p):
    """Rewrite the text file at ``p`` with one sentence per paragraph.

    The file is read in full, split into sentences with ``sentences`` and
    reassembled with ``reformat``; the result then replaces the file's
    contents.  The new text is computed before the file is reopened for
    writing, so a failure while tokenizing leaves the original untouched.

    Both the read and the write use UTF-8 explicitly so the outcome does not
    depend on the platform's default encoding.

    Args:
        p: Path of the file to rewrite.
    """
    with open(p, 'r', encoding='utf-8') as f:
        text = f.read()
    new_text = reformat(sentences(text))
    with open(p, 'w', encoding='utf-8') as f:
        f.write(new_text)
def reformat_dir(p):
    """Reformat every ``.txt`` file located directly inside directory ``p``.

    Entries are visited in sorted name order so runs are reproducible.  Only
    regular files whose name ends in ``.txt`` are processed; anything else
    (including a directory that happens to be named ``*.txt``) is skipped.
    Subdirectories are not descended into.

    Args:
        p: Path of the directory to scan.
    """
    for name in sorted(os.listdir(p)):
        if not name.endswith('.txt'):
            continue
        path = os.path.join(p, name)
        if not os.path.isfile(path):
            # A directory named like a text file cannot be opened for reading.
            continue
        print('[INFO] Processing ' + name)
        reformat_inplace(path)
if __name__ == "__main__":
    import sys

    # An optional first command-line argument names the directory to process;
    # without it the script falls back to its original default location.
    reformat_dir(sys.argv[1] if len(sys.argv) > 1 else '/Users/terrorjack/todo')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment