Skip to content

Instantly share code, notes, and snippets.

@mlai-demo
Last active August 24, 2019 03:19
Show Gist options
  • Save mlai-demo/dbebd041cd343386b0e8e73e39792203 to your computer and use it in GitHub Desktop.
Save mlai-demo/dbebd041cd343386b0e8e73e39792203 to your computer and use it in GitHub Desktop.
tokenize text
# Normalize the Plutarch text and report basic statistics.
#
# Reads `Plutarch.txt` from `fpath`, lowercases it, keeps only the letters
# a-z plus a small set of punctuation, collapses runs of spaces, prints word
# statistics and a preview, and writes the cleaned text to `Plutarch2.txt`.
#
# NOTE: relies on `re` and `fpath` being defined earlier in the file.
with open(fpath + '/Plutarch.txt') as f, \
        open(fpath + '/Plutarch2.txt', 'w') as out_f:
    text = f.read().lower()

    # Replace every character outside a-z and . ? ! - ' : ; with a space.
    # Newlines, tabs and digits are outside that set, so they become spaces
    # here and no separate newline-removal pass is needed.
    new_text = re.sub(r"[^a-z.?!\-':;]", ' ', text)

    # Collapse the runs of spaces left behind by the substitution above.
    new_text = re.sub(' +', ' ', new_text)

    # Only single spaces separate tokens at this point; split() also drops
    # the empty strings caused by a leading or trailing space.
    items = new_text.split()
    unique_items = set(items)

    # Guard against an empty (or fully filtered-out) input file, which would
    # otherwise raise ZeroDivisionError.
    if items:
        avg_length = round(sum(len(word) for word in items) / len(items), 2)
    else:
        avg_length = 0

    print("The text is {} words long, has {} unique items and {} characters on average\n".format
          (len(items), len(unique_items), avg_length))
    print("First 1000 characters of the text:\n", new_text[:1000])
    out_f.write(new_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment