Last active
September 3, 2020 03:39
-
-
Save brandonko/f0d30d32f1e50014b7abd1b098316eae to your computer and use it in GitHub Desktop.
NLP Dataset Preprocessing using Large N-Grams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

nltk.download('punkt')
def extract_ngrams_sentences(sentences, num):
    """Collect all n-grams of size ``num`` across a list of tokenized sentences.

    Args:
        sentences: iterable of token lists (one list of words per sentence).
        num: n-gram window size.

    Returns:
        A flat list of n-grams, each rendered as a space-joined string,
        in sentence order.
    """
    collected = []
    for tokens in sentences:
        # nltk.util.ngrams yields tuples of `num` consecutive tokens.
        for gram in ngrams(tokens, num):
            collected.append(' '.join(gram))
    return collected
def split_by_newline_and_period(pages):
    """Split each page of text into sentence-like chunks.

    Splits on literal newlines and on a period followed by a space.

    Args:
        pages: iterable of strings (one string per page/document).

    Returns:
        A flat list of substrings from all pages, in original order.
        Empty input yields an empty list.
    """
    sentences = []
    for page in pages:
        # Raw string so the regex is explicit: split on '\n' or on '. '.
        # (Original file used `re` without ever importing it — fixed at the
        # module-level import block.)
        sentences += re.split(r'\n|\. ', page)
    return sentences
# Break the dataset up into sentences, split by newline characters and periods.
# NOTE(review): `parsed_texts` must be defined earlier in the full file (an
# iterable of page strings) — it is not visible in this snippet; confirm.
sentences = split_by_newline_and_period(parsed_texts)

# Add unwanted regex patterns into this array; matching sentences are dropped.
filter_strs = []

# Filter out sentences matching any unwanted pattern (case-insensitive).
# Use a generator with any() instead of materializing a list per sentence.
sentences = [x for x in sentences
             if not any(re.search(filter_str, x, re.IGNORECASE)
                        for filter_str in filter_strs)]

# Tokenize each sentence into a list of words for n-gram extraction.
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Adjust NGRAM_SIZE to capture unwanted phrases.
NGRAM_SIZE = 15

ngrams_all = extract_ngrams_sentences(tokenized_sentences, NGRAM_SIZE)

# Rank the n-grams by frequency, most common first.
n_gram_all = nltk.FreqDist(ngrams_all).most_common()

# Print out the top 10 most common n-grams.
print(f'{NGRAM_SIZE}-Gram Frequencies')
for gram, count in n_gram_all[:10]:
    print(f'{count}\t"{gram}"')

# Plot the frequency of every n-gram in rank order (rank vs. count).
plt.plot([count for _, count in n_gram_all])
plt.xlabel('n-gram')
plt.ylabel('frequency')
plt.title(f'{NGRAM_SIZE}-Gram Frequencies')
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment