# ecdedios/ngram_functions.py

## ngram_functions.py
def clean(text):
    """
    Normalize a string and return its lemmatized, non-stop-word tokens.

    The text is NFKD-normalized, reduced to ASCII, lower-cased and stripped
    of punctuation, then split on whitespace. Tokens that appear in NLTK's
    English stop-word list or in ``ADDITIONAL_STOPWORDS`` are dropped and
    the remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        A list of lemmatized tokens, in the order they occur in ``text``.

    Note:
        ``nltk``, ``re``, ``unicodedata`` and ``ADDITIONAL_STOPWORDS`` must
        be available at module level; none of them is defined in this
        function.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set makes the per-token membership test below O(1) instead of a
    # linear scan over the whole stop-word list.
    stopwords = set(nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS)
    # Encoding to ASCII with errors ignored discards every character that
    # has no ASCII form after NFKD normalization.
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Remove everything that is neither a word character nor whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

def get_words(df, column):
    """
    Return the cleaned list of words found in one column of a dataframe.

    Every non-missing value of ``df[column]`` is converted with ``str`` and
    the results are joined with single spaces before being handed to
    ``clean``.

    The values are joined individually rather than by taking ``str()`` of
    the whole list. The list repr escapes characters such as newlines and
    non-breaking spaces into a backslash sequence; once ``clean`` removes
    the backslash, the leftover letters end up glued to neighbouring words.

    Args:
        df: Dataframe holding the text.
        column: Label of the column to read.

    Returns:
        The list of tokens produced by ``clean``.
    """
    # Missing values are skipped so they do not show up as 'nan' / 'none'
    # tokens in the result.
    values = df[column].dropna().tolist()
    return clean(' '.join(str(value) for value in values))

def get_bigrams(df, column, top_n=10):
    """
    Count the bigrams in a dataframe column and return the most frequent.

    The words come from ``get_words(df, column)``; consecutive pairs are
    built with ``nltk.ngrams`` and tallied with ``Series.value_counts``.

    Args:
        df: Dataframe holding the text.
        column: Label of the column to read.
        top_n: Maximum number of bigrams to return. Defaults to 10;
            ``None`` returns every bigram.

    Returns:
        A pandas Series indexed by bigram tuple whose values are occurrence
        counts, sorted from most to least frequent.
    """
    # Materialise the n-gram iterator so the Series receives a plain list.
    bigrams = list(nltk.ngrams(get_words(df, column), 2))
    # .iloc makes the "first top_n rows" slice explicitly positional.
    return pd.Series(bigrams).value_counts().iloc[:top_n]

def get_trigrams(df, column, top_n=10):
    """
    Count the trigrams in a dataframe column and return the most frequent.

    The words come from ``get_words(df, column)``; consecutive triples are
    built with ``nltk.ngrams`` and tallied with ``Series.value_counts``.

    Args:
        df: Dataframe holding the text.
        column: Label of the column to read.
        top_n: Maximum number of trigrams to return. Defaults to 10;
            ``None`` returns every trigram.

    Returns:
        A pandas Series indexed by trigram tuple whose values are
        occurrence counts, sorted from most to least frequent.
    """
    # Materialise the n-gram iterator so the Series receives a plain list.
    trigrams = list(nltk.ngrams(get_words(df, column), 3))
    # .iloc makes the "first top_n rows" slice explicitly positional.
    return pd.Series(trigrams).value_counts().iloc[:top_n]

def viz_bigrams(df, column):
    """
    Draw a horizontal bar chart of the most frequent bigrams in a column.

    The counts come from ``get_bigrams(df, column)`` and are sorted in
    ascending order before plotting, so the most frequent bigram is the
    last bar drawn.

    Args:
        df: Dataframe holding the text.
        column: Label of the column to read.

    Returns:
        None. The chart is drawn on a new 12x8 matplotlib figure.
    """
    top_bigrams = get_bigrams(df, column)
    ax = top_bigrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))

    # The number in the title is taken from the data so it always matches
    # the number of bars actually drawn.
    ax.set_title('{} Most Frequently Occurring Bigrams'.format(len(top_bigrams)))
    ax.set_ylabel('Bigram')
    ax.set_xlabel('# Occurrences')

def viz_trigrams(df, column):
    """
    Draw a horizontal bar chart of the most frequent trigrams in a column.

    The counts come from ``get_trigrams(df, column)`` and are sorted in
    ascending order before plotting, so the most frequent trigram is the
    last bar drawn.

    Args:
        df: Dataframe holding the text.
        column: Label of the column to read.

    Returns:
        None. The chart is drawn on a new 12x8 matplotlib figure.
    """
    top_trigrams = get_trigrams(df, column)
    ax = top_trigrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))

    # The number in the title is taken from the data so it always matches
    # the number of bars actually drawn.
    ax.set_title('{} Most Frequently Occurring Trigrams'.format(len(top_trigrams)))
    ax.set_ylabel('Trigram')
    ax.set_xlabel('# Occurrences')
	def clean(text):
		"""
		Normalize a string and return its lemmatized, non-stop-word tokens.

		The text is NFKD-normalized, reduced to ASCII, lower-cased and
		stripped of punctuation, then split on whitespace. Tokens that
		appear in NLTK's English stop-word list or in
		``ADDITIONAL_STOPWORDS`` are dropped and the remaining tokens are
		lemmatized with ``WordNetLemmatizer``.

		Args:
			text: The raw string to clean.

		Returns:
			A list of lemmatized tokens, in the order they occur in ``text``.

		Note:
			``nltk``, ``re``, ``unicodedata`` and ``ADDITIONAL_STOPWORDS``
			must be available in an enclosing scope; none of them is
			defined in this function.
		"""
		wnl = nltk.stem.WordNetLemmatizer()
		# A set makes the per-token membership test below O(1) instead of a
		# linear scan over the whole stop-word list.
		stopwords = set(nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS)
		# Encoding to ASCII with errors ignored discards every character
		# that has no ASCII form after NFKD normalization.
		text = (unicodedata.normalize('NFKD', text)
			.encode('ascii', 'ignore')
			.decode('utf-8', 'ignore')
			.lower())
		# Remove everything that is neither a word character nor whitespace.
		words = re.sub(r'[^\w\s]', '', text).split()
		return [wnl.lemmatize(word) for word in words if word not in stopwords]

	def get_words(df, column):
		"""
		Return the cleaned list of words found in one column of a dataframe.

		Every non-missing value of ``df[column]`` is converted with ``str``
		and the results are joined with single spaces before being handed
		to ``clean``.

		The values are joined individually rather than by taking ``str()``
		of the whole list. The list repr escapes characters such as
		newlines and non-breaking spaces into a backslash sequence; once
		``clean`` removes the backslash, the leftover letters end up glued
		to neighbouring words.

		Args:
			df: Dataframe holding the text.
			column: Label of the column to read.

		Returns:
			The list of tokens produced by ``clean``.
		"""
		# Missing values are skipped so they do not show up as 'nan' /
		# 'none' tokens in the result.
		values = df[column].dropna().tolist()
		return clean(' '.join(str(value) for value in values))

	def get_bigrams(df, column, top_n=10):
		"""
		Count the bigrams in a dataframe column and return the most frequent.

		The words come from ``get_words(df, column)``; consecutive pairs
		are built with ``nltk.ngrams`` and tallied with
		``Series.value_counts``.

		Args:
			df: Dataframe holding the text.
			column: Label of the column to read.
			top_n: Maximum number of bigrams to return. Defaults to 10;
				``None`` returns every bigram.

		Returns:
			A pandas Series indexed by bigram tuple whose values are
			occurrence counts, sorted from most to least frequent.
		"""
		# Materialise the n-gram iterator so the Series receives a plain list.
		bigrams = list(nltk.ngrams(get_words(df, column), 2))
		# .iloc makes the "first top_n rows" slice explicitly positional.
		return pd.Series(bigrams).value_counts().iloc[:top_n]

	def get_trigrams(df, column, top_n=10):
		"""
		Count the trigrams in a dataframe column and return the most frequent.

		The words come from ``get_words(df, column)``; consecutive triples
		are built with ``nltk.ngrams`` and tallied with
		``Series.value_counts``.

		Args:
			df: Dataframe holding the text.
			column: Label of the column to read.
			top_n: Maximum number of trigrams to return. Defaults to 10;
				``None`` returns every trigram.

		Returns:
			A pandas Series indexed by trigram tuple whose values are
			occurrence counts, sorted from most to least frequent.
		"""
		# Materialise the n-gram iterator so the Series receives a plain list.
		trigrams = list(nltk.ngrams(get_words(df, column), 3))
		# .iloc makes the "first top_n rows" slice explicitly positional.
		return pd.Series(trigrams).value_counts().iloc[:top_n]

	def viz_bigrams(df, column):
		"""
		Draw a horizontal bar chart of the most frequent bigrams in a column.

		The counts come from ``get_bigrams(df, column)`` and are sorted in
		ascending order before plotting, so the most frequent bigram is the
		last bar drawn.

		Args:
			df: Dataframe holding the text.
			column: Label of the column to read.

		Returns:
			None. The chart is drawn on a new 12x8 matplotlib figure.
		"""
		top_bigrams = get_bigrams(df, column)
		ax = top_bigrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))

		# The number in the title is taken from the data so it always
		# matches the number of bars actually drawn.
		ax.set_title('{} Most Frequently Occurring Bigrams'.format(len(top_bigrams)))
		ax.set_ylabel('Bigram')
		ax.set_xlabel('# Occurrences')

	def viz_trigrams(df, column):
		"""
		Draw a horizontal bar chart of the most frequent trigrams in a column.

		The counts come from ``get_trigrams(df, column)`` and are sorted in
		ascending order before plotting, so the most frequent trigram is
		the last bar drawn.

		Args:
			df: Dataframe holding the text.
			column: Label of the column to read.

		Returns:
			None. The chart is drawn on a new 12x8 matplotlib figure.
		"""
		top_trigrams = get_trigrams(df, column)
		ax = top_trigrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))

		# The number in the title is taken from the data so it always
		# matches the number of bars actually drawn.
		ax.set_title('{} Most Frequently Occurring Trigrams'.format(len(top_trigrams)))
		ax.set_ylabel('Trigram')
		ax.set_xlabel('# Occurrences')