Skip to content

Instantly share code, notes, and snippets.

@kelciour
Created June 2, 2019 12:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save kelciour/cb66696845ee1a3b672f900c5fa70c65 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Split plain-text files into one sentence per line.

Reads every ``txt/*.txt`` file, tokenizes each input line into sentences
with NLTK's pre-trained Punkt English model, and writes the result to
``sentences/<same basename>``, one sentence per output line.

Requires the Punkt model data (``nltk.download('punkt')``).
"""
import codecs
import glob
import os
import re
import sys

import nltk.data

# Pre-trained Punkt sentence tokenizer.
# NOTE: renamed from `pickle` — the original name shadowed the stdlib module.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

dir_name = 'sentences'
# exist_ok avoids the exists()/mkdir() race of the original check.
os.makedirs(dir_name, exist_ok=True)

for txt_filename in glob.glob("txt/*.txt"):
    sent_filename = os.path.join(dir_name, os.path.basename(txt_filename))
    # ASCII-safe progress output (the original stripped non-ASCII to avoid
    # console encoding errors); decode back so Python 3 prints text, not bytes.
    print(txt_filename.encode('ascii', 'ignore').decode('ascii'))
    with codecs.open(txt_filename, 'r', 'utf-8') as f_txt, \
            codecs.open(sent_filename, 'w', 'utf-8') as f_sent:
        data = f_txt.read()
        # Normalize spaced ellipses so Punkt does not split on ". . ."
        txt = data.replace(". . .", "...")
        for line in txt.splitlines():
            line = line.strip()
            # Punkt returns [] for an empty line, so blanks emit nothing.
            for sentence in tokenizer.tokenize(line):
                f_sent.write(sentence)
                f_sent.write("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment