Skip to content

Instantly share code, notes, and snippets.

@ecdedios
Last active May 30, 2020 20:12
Show Gist options
  • Save ecdedios/de48523d5b65b5a591b08d3d7edc3ba8 to your computer and use it in GitHub Desktop.
Save ecdedios/de48523d5b65b5a591b08d3d7edc3ba8 to your computer and use it in GitHub Desktop.
Basic cleaning and n-gram helper functions.
def clean(text):
    """
    Normalize a text string and return its lemmatized, non-stop-word tokens.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot
    be encoded are dropped), and lower-cased. Every character that is
    neither a word character nor whitespace is then removed and the
    result is split on whitespace. Tokens found in the stop-word
    collection are discarded; the remaining tokens are lemmatized with
    NLTK's WordNetLemmatizer.

    The stop-word collection is NLTK's English stop-word list plus the
    module-level ``ADDITIONAL_STOPWORDS`` (concatenated with ``+``, so it
    is expected to be a list).

    Returns a list of lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Build the stop words once as a frozenset so the per-token membership
    # test below is a hash lookup instead of a scan through a list.
    stopwords = frozenset(
        nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    )
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Strip punctuation; note that \w keeps letters, digits and underscores.
    words = re.sub(r'[^\w\s]', '', text).split()
    # Stop words are filtered on the raw token, before lemmatization.
    return [wnl.lemmatize(word) for word in words if word not in stopwords]
def get_words(df, column):
    """
    Return the cleaned list of words found in one column of a dataframe.

    Each value in ``df[column]`` is converted with ``str``, the pieces
    are joined with single spaces, and the combined text is passed to
    ``clean``. The return value is whatever ``clean`` returns for that
    text.
    """
    # Stringify each value on its own rather than taking the repr of the
    # whole list: the list repr escapes newlines, tabs and similar
    # characters (e.g. a newline becomes the two characters "\" and "n"),
    # and once punctuation is stripped the leftover letter would glue
    # neighbouring words together.
    return clean(' '.join(str(value) for value in df[column].tolist()))
def get_bigrams(df, column, top_n=10):
    """
    Return the most frequent bigrams in one column of a dataframe.

    The words are obtained with ``get_words(df, column)``, paired into
    consecutive bigrams with ``nltk.ngrams``, and counted.

    ``top_n`` is the number of rows to keep from the top of the counts
    (default 10, which matches the previous fixed behavior).

    Returns a pandas Series indexed by bigram tuple whose values are the
    occurrence counts, in the order produced by ``value_counts``.
    """
    bigram_counts = pd.Series(nltk.ngrams(get_words(df, column), 2)).value_counts()
    # head() selects by position, so it is unaffected by the tuple index.
    return bigram_counts.head(top_n)
def get_trigrams(df, column, top_n=10):
    """
    Return the most frequent trigrams in one column of a dataframe.

    The words are obtained with ``get_words(df, column)``, grouped into
    consecutive trigrams with ``nltk.ngrams``, and counted.

    ``top_n`` is the number of rows to keep from the top of the counts
    (default 10, which matches the previous fixed behavior).

    Returns a pandas Series indexed by trigram tuple whose values are the
    occurrence counts, in the order produced by ``value_counts``.
    """
    trigram_counts = pd.Series(nltk.ngrams(get_words(df, column), 3)).value_counts()
    # head() selects by position, so it is unaffected by the tuple index.
    return trigram_counts.head(top_n)
def viz_bigrams(df, column):
    """
    Plot a horizontal bar chart of the most frequent bigrams in a column.

    The counts come from ``get_bigrams(df, column)`` and are sorted by
    value before plotting. The chart title, y-label and x-label are set
    through ``plt``. Nothing is returned.
    """
    counts = get_bigrams(df, column)
    counts.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
    # Take the number in the title from the data actually plotted, so the
    # title cannot disagree with the chart (it used to claim 20).
    plt.title('{} Most Frequently Occurring Bigrams'.format(len(counts)))
    plt.ylabel('Bigram')
    plt.xlabel('# Occurrences')
def viz_trigrams(df, column):
    """
    Plot a horizontal bar chart of the most frequent trigrams in a column.

    The counts come from ``get_trigrams(df, column)`` and are sorted by
    value before plotting. The chart title, y-label and x-label are set
    through ``plt``. Nothing is returned.
    """
    counts = get_trigrams(df, column)
    counts.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
    # Take the number in the title from the data actually plotted, so the
    # title cannot disagree with the chart (it used to claim 20).
    plt.title('{} Most Frequently Occurring Trigrams'.format(len(counts)))
    plt.ylabel('Trigram')
    plt.xlabel('# Occurrences')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment