Skip to content

Instantly share code, notes, and snippets.

@pcmill
Created August 24, 2018 18:01
Show Gist options
  • Save pcmill/d539b1312ae17b039fa3b63c52b8dc8a to your computer and use it in GitHub Desktop.
Save pcmill/d539b1312ae17b039fa3b63c52b8dc8a to your computer and use it in GitHub Desktop.
Some code to compute the corpus size.
# Open the corpus file; it is kept open at module level and closed
# explicitly at the end of the script (see file.close() below).
file = open('sentences.txt', 'r', encoding="utf8")
# Read the whole corpus into a single string.
sentences = file.read()
def tokenize(text=None):
    """Lower-case the corpus and split it into whitespace-separated tokens.

    Args:
        text: Optional string to tokenize. Defaults to the module-level
            ``sentences`` string read from the corpus file, preserving the
            original no-argument behavior.

    Returns:
        A list of lower-cased tokens, or ``None`` when the input is ``None``.
    """
    if text is None:
        # Fall back to the module-level corpus, as the original did.
        text = sentences
    if text is not None:
        return text.lower().split()
    return None
def map_book(tokens):
    """Count word frequencies in a token list.

    Punctuation characters (, . ? : ! ; ") are stripped from each token
    before counting, matching the original chained ``replace`` calls but
    done in a single C-level pass with ``str.translate``.

    Args:
        tokens: Iterable of word tokens, or ``None``.

    Returns:
        A dict mapping cleaned word -> occurrence count, or ``None`` when
        ``tokens`` is ``None``.
    """
    if tokens is None:
        return None
    # Translation table deleting every punctuation char the original removed.
    strip_punct = str.maketrans("", "", ',.?:!;"')
    hash_map = {}
    for element in tokens:
        word = element.translate(strip_punct)
        # dict.get avoids the explicit membership test of the original.
        hash_map[word] = hash_map.get(word, 0) + 1
    return hash_map
# Tokenize the book into lower-cased words.
words = tokenize()
# Build the word-frequency dictionary.
# Renamed from `map` to avoid shadowing the builtin of the same name.
word_counts = map_book(words)
# The corpus size here is the number of DISTINCT words (vocabulary size).
print(len(word_counts))
# Close the sentences file now that the corpus has been processed.
file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment