Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created Oct 4, 2020
Embed
What would you like to do?
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter
# Build the word list and vocabulary from the corpus file.
#
# Reads moby.txt from the current working directory, lower-cases the text so
# the vocabulary is case-insensitive, and tokenises it into `words`.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm it matches how moby.txt is stored.
with open('moby.txt', 'r') as f:
    file_name_data = f.read()
file_name_data = file_name_data.lower()

# Tokenise on runs of word characters. The pattern is a raw string so that
# `\w` reaches the regex engine as written instead of being parsed as an
# (invalid) string escape sequence.
words = re.findall(r'\w+', file_name_data)

# This is our vocabulary
V = set(words)

print(f"The first ten words in the text are: \n{words[0:10]}")
print(f"There are {len(V)} unique words in the vocabulary.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment