Created
October 31, 2020 15:20
-
-
Save elyasha/cb3e3616136bb191ed37dbfd1510ce91 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count and print the most frequent bigrams, trigrams and 8-grams found in
# the text exposed by the local `looking_glass` module.
import re
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
# importing ngrams module from nltk
from nltk.util import ngrams

from looking_glass import looking_glass_full_text

# Replace every run of non-word characters with a single space, then
# lowercase. The pattern is a raw string so that `\W` is passed to the regex
# engine verbatim instead of being treated as an (invalid) string escape.
cleaned = re.sub(r'\W+', ' ', looking_glass_full_text).lower()
tokenized = word_tokenize(cleaned)

# NOTE: ngrams() yields its tuples lazily, so each `*_bigrams` / `*_trigrams` /
# `*_ngrams` name below is consumed by the Counter built from it on the
# following line; use the `*_frequency` Counters for any further analysis.

# Pairs of adjacent tokens (n = 2).
looking_glass_bigrams = ngrams(tokenized, 2)
looking_glass_bigrams_frequency = Counter(looking_glass_bigrams)

# Triples of adjacent tokens (n = 3).
looking_glass_trigrams = ngrams(tokenized, 3)
looking_glass_trigrams_frequency = Counter(looking_glass_trigrams)

# Longer sequences (n = 8).
looking_glass_ngrams = ngrams(tokenized, 8)
looking_glass_ngrams_frequency = Counter(looking_glass_ngrams)

# Report the ten most common n-grams of each size.
print("Looking Glass Bigrams:")
print(looking_glass_bigrams_frequency.most_common(10))

print("\nLooking Glass Trigrams:")
print(looking_glass_trigrams_frequency.most_common(10))

print("\nLooking Glass n-grams:")
print(looking_glass_ngrams_frequency.most_common(10))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment