Skip to content

Instantly share code, notes, and snippets.

@mikewcasale
Created March 26, 2020 05:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikewcasale/82e362ab0a15b8a830f564e04cf903f9 to your computer and use it in GitHub Desktop.
Save mikewcasale/82e362ab0a15b8a830f564e04cf903f9 to your computer and use it in GitHub Desktop.
cleanup NLP input
import re
from tqdm import tqdm_notebook as tqdm
# Matches any run of one or more spaces; compiled once because it is applied
# to every input file.
_SPACE_RUN = re.compile(' +')


def _clean_text(text):
    """Flatten *text* onto one line and undo tokenizer-style spacing.

    Newlines and tabs become spaces, runs of spaces collapse to a single
    space, and the spaces that surround common punctuation marks are removed.
    The replacements run in a fixed order; later ones operate on the result
    of earlier ones, so the order must not be changed.
    """
    text = text.replace('\n', ' ').replace('\t', ' ')
    ans = _SPACE_RUN.sub(' ', text)
    # Re-attach punctuation that is separated from its word by a space.
    ans = ans.replace(" ,", ",").replace(" .", ".").replace(" %", "%")
    ans = ans.replace(" - ", "-").replace(" : ", ":").replace(" / ", "/")
    ans = ans.replace("( ", "(").replace(" )", ")")
    # `` ... '' quote pairs become plain double quotes.
    ans = ans.replace("`` ", "\"").replace(" ''", "\"")
    # Possessives: "dog 's" -> "dog's", "dogs ' " -> "dogs' ".
    ans = ans.replace(" 's", "'s").replace("s ' ", "s' ")
    return ans


# For every split, write DATA_DIR/<split>.txt containing one cleaned line per
# source file listed in data[split].
for split in data:
    with tf.io.gfile.GFile(os.path.join(DATA_DIR, split + '.txt'), 'w') as g:
        for fn in tqdm(data[split]):
            # errors='ignore' drops bytes that cannot be decoded instead of
            # raising. NOTE(review): no encoding is given, so the platform
            # default is used -- confirm that matches the corpus.
            with open(fn, errors='ignore') as f:
                text = f.read()
            g.write(_clean_text(text) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment