BrianHung/93a23bbdfe3315228cb37a0412a1c0b6
Last active June 12, 2019 09:19
Given a text_block, return the n most frequent words excluding stop_words, punctuation, and optionally numbers.
import nltk
nltk.download('stopwords')

from nltk import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Membership tests on a set are faster than on a list; input must be lowercased to match.
stop_words = set(stopwords.words("english"))

# collections.Counter is faster than nltk's FreqDist.
from collections import Counter

def find_most_common_ngram(text_block, exclude_numeric=False, n=1):
    """
    Given a text_block, return a Counter of the most frequent n-grams excluding stop_words.
    To find the most frequent words, set n = 1 (default).
    """
    # Lowercase all characters in the input.
    text_block = text_block.lower()
    # The regex keeps contractions intact and optionally drops numbers.
    if exclude_numeric:
        tokenizer = RegexpTokenizer(r"[A-Za-z']+")
    else:
        tokenizer = RegexpTokenizer(r"[\w']+")
    # Split the text into tokens.
    text_token = tokenizer.tokenize(text_block)
    # Remove tokens which are stop words.
    text_token = [w for w in text_token if w not in stop_words]
    # Count occurrences of each distinct n-gram and return the Counter.
    return Counter(ngrams(text_token, n))
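For reference, the same pipeline can be approximated without nltk using only the standard library. This is a hedged sketch, not the gist's implementation: the tiny STOP_WORDS set below is an illustrative stand-in for nltk's full English stop-word list, and `zip` over shifted slices replaces `nltk.ngrams`.

```python
import re
from collections import Counter

# Illustrative stand-in for nltk's English stop-word list (not the real list).
STOP_WORDS = {"a", "an", "the", "is", "i", "to", "of", "and"}

def most_common_ngrams(text, n=1, exclude_numeric=False):
    """Count n-grams after lowercasing, tokenizing, and dropping stop words."""
    # Same regexes as the gist: keep contractions; optionally drop digits.
    pattern = r"[A-Za-z']+" if exclude_numeric else r"[\w']+"
    tokens = [t for t in re.findall(pattern, text.lower()) if t not in STOP_WORDS]
    # Slide a window of size n over the token list to form n-gram tuples.
    return Counter(zip(*(tokens[i:] for i in range(n))))

counts = most_common_ngrams("Show HN: I made a thing. Show HN: the other thing.", n=2)
print(counts.most_common(1))  # → [(('show', 'hn'), 2)]
```

Like the gist, this returns tuples even for n=1, so the Counter's keys have a uniform shape regardless of n.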
Updated the gist to support counting n-grams as well. Results for n=2 and n=3, respectively.
[(('show', 'hn'), 5235),
(('ask', 'hn'), 4154),
(('open', 'source'), 1687),
(('machine', 'learning'), 612),
(('silicon', 'valley'), 531),
(('node', 'js'), 522),
(('hacker', 'news'), 496),
(('programming', 'language'), 494),
(('yc', 'w'), 476),
(('deep', 'learning'), 456),
(('year', 'old'), 420),
(('real', 'time'), 358),
(('os', 'x'), 328),
(('new', 'york'), 325),
(('raspberry', 'pi'), 304),
(('source', 'code'), 266),
(('app', 'store'), 258),
(('steve', 'jobs'), 252),
(('san', 'francisco'), 247),
(('tell', 'hn'), 240)]
[(('ask', 'hn', 'best'), 189),
(('ask', 'hn', "what's"), 125),
(('ask', 'hn', 'hiring'), 116),
(('mac', 'os', 'x'), 95),
(('new', 'york', 'times'), 88),
(('ask', 'hn', 'anyone'), 88),
(('self', 'driving', 'cars'), 84),
(('pdf', 'show', 'hn'), 82),
(('ask', 'hn', 'freelancer'), 79),
(('hn', 'freelancer', 'seeking'), 76),
(('freelancer', 'seeking', 'freelancer'), 70),
(('ask', 'hn', 'good'), 69),
(('self', 'driving', 'car'), 64),
(('ask', 'hn', 'get'), 62),
(('new', 'york', 'city'), 61),
(('google', 'app', 'engine'), 59),
(('open', 'source', 'software'), 58),
(('show', 'hn', 'simple'), 57),
(('show', 'hn', 'built'), 53),
(('show', 'hn', 'made'), 51)]
An analysis of comments may be interesting as a followup.
Running this script on Hacker News's most popular stories, the top two entries make sense: "Show HN: ..." and "Ask HN: ..." are common title formats for posted stories. A quick browse of the titles containing numbers indicates that they are mostly noise, e.g. SHA-1, 2015, 14 year old, Machine Learning 101, $13.1B, etc., which are safe to ignore; setting the exclude_numeric flag filters them out. If you want to replicate the results, download all the stories posted to HN with more than 50 upvotes. You can do this via BigQuery.
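A sketch of what that BigQuery pull might look like. This is an assumption, not the author's query: the table and column names (`bigquery-public-data.hacker_news.full`, `title`, `type`, `score`) come from Google's public Hacker News dataset and should be checked against the current schema.

```python
# Query string for the public Hacker News dataset on BigQuery:
# story titles with more than 50 upvotes.
QUERY = """
SELECT title
FROM `bigquery-public-data.hacker_news.full`
WHERE type = 'story' AND score > 50
"""

# With the google-cloud-bigquery client installed and credentials configured,
# the titles could then be fetched and fed to find_most_common_ngram, e.g.:
#
#   from google.cloud import bigquery
#   titles = [row.title for row in bigquery.Client().query(QUERY)]
```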