# YiLi225/ngrams.py

## ngrams.py
import re

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from wordcloud import WordCloud, ImageColorGenerator

def replace(match):
    """Return the replacement text for a regex match.

    Meant to be passed as the ``repl`` callable of ``re.sub``: the full
    matched text is looked up in the module-level ``swMapping`` dict and
    the mapped value is returned.

    Raises:
        KeyError: if the matched text is not a key of ``swMapping``.
    """
    matched_text = match.group(0)
    return swMapping[matched_text]
## Define stopwords
# NLTK's English stopword list plus the placeholder token 'unk'.
curSW = stopwords.words('english')
curSW += ['unk']
# Every stopword maps to the empty string; `replace` looks matches up here.
swMapping = dict.fromkeys(curSW, '')

## Raw text data
# NOTE(review): `doc_set` is not defined in this file; it must already exist
# (presumably a sequence of note strings) -- confirm against the caller.
full_notes = doc_set[8319:8334]

## Remove all numbers and $ signs/punctuations:
full_notes = [''.join(ch for ch in note if not ch.isdigit()) for note in full_notes]
# Collapse every run of non-word characters into a single space.
full_notes = [re.sub(r'\W+', ' ', note) for note in full_notes]
# Build the whole-word stopword alternation once instead of once per note.
stopword_pattern = re.compile(
    '|'.join(r'\b%s\b' % re.escape(word) for word in swMapping)
)
full_notes = [stopword_pattern.sub(replace, note).strip() for note in full_notes]
# Removing stopwords leaves doubled spaces behind; normalise the whitespace.
full_notes = [' '.join(note.split()) for note in full_notes]

## Build an ngrams model
def word_grams(words, min=2, max=5):
    """Return all n-grams of *words* for every n from *min* through *max*.

    Args:
        words: Iterable of tokens. Each token is converted with ``str``
            before joining.
        min: Smallest n-gram length to generate.
        max: Largest n-gram length to generate (inclusive).

    Returns:
        A list of n-grams, each rendered as its tokens joined by a single
        space. Shorter n-grams come first; within one length the n-grams
        follow their order of appearance in *words*.
    """
    # NOTE: `min` / `max` shadow the builtins, but they are part of the
    # public signature (callers may pass them by keyword), so they stay.
    # Materialise once so a one-shot iterable can be scanned for every n.
    tokens = list(words)
    phrases = []
    # `max + 1`: the upper bound is inclusive, so the defaults give 2..5 words.
    for n in range(min, max + 1):
        phrases.extend(' '.join(str(tok) for tok in gram) for gram in ngrams(tokens, n))
    return phrases

# Tokenise each note separately. Calling str() on the list would tokenise its
# repr (tokens such as "['power" or "crystal',") and would also create
# n-grams that straddle two unrelated notes.
grams = [gram for note in full_notes for gram in word_grams(note.split())]
freq_grams = nltk.FreqDist(grams)

# [phrase, count] pairs, most frequent first.
dd = [[phrase, count] for phrase, count in freq_grams.most_common()]
# Top 5 most frequent phrases
print(dd[:5])
# Example output:
# [['power crystal', 6],
#  ['boom bang', 5],
#  ['final key', 5],
#  ['super big', 4],
#  ['big power', 4]]
	from wordcloud import WordCloud, ImageColorGenerator
	from nltk.corpus import stopwords
	from nltk.util import ngrams
	import nltk

	def replace(match):
		"""Return the replacement text for a regex match.

		Meant to be passed as the ``repl`` callable of ``re.sub``: the full
		matched text is looked up in ``swMapping`` and the mapped value is
		returned.

		Raises:
			KeyError: if the matched text is not a key of ``swMapping``.
		"""
		return swMapping[match.group(0)]
	## Define stopwords
	# NLTK's English stopword list plus the placeholder token 'unk'.
	curSW = stopwords.words('english')
	curSW += ['unk']
	# Every stopword maps to the empty string; `replace` looks matches up here.
	swMapping = dict.fromkeys(curSW, '')

	## Raw text data
	# NOTE(review): `doc_set` is not defined in this file; it must already exist
	# (presumably a sequence of note strings) -- confirm against the caller.
	full_notes = doc_set[8319:8334]

	## Remove all numbers and $ signs/punctuations:
	full_notes = [''.join(ch for ch in note if not ch.isdigit()) for note in full_notes]
	# Collapse every run of non-word characters into a single space.
	full_notes = [re.sub(r'\W+', ' ', note) for note in full_notes]
	# Join the alternatives with a bare '|': an escaped '\|' is a literal pipe,
	# which would turn the pattern into one long literal that never matches.
	# Compiled once instead of being rebuilt for every note.
	stopword_pattern = re.compile(
		'|'.join(r'\b%s\b' % re.escape(word) for word in swMapping)
	)
	full_notes = [stopword_pattern.sub(replace, note).strip() for note in full_notes]
	# Removing stopwords leaves doubled spaces behind; normalise the whitespace.
	full_notes = [' '.join(note.split()) for note in full_notes]

	## Build an ngrams model
	def word_grams(words, min=2, max=5):
		"""Return all n-grams of *words* for every n from *min* through *max*.

		Args:
			words: Iterable of tokens. Each token is converted with ``str``
				before joining.
			min: Smallest n-gram length to generate.
			max: Largest n-gram length to generate (inclusive).

		Returns:
			A list of n-grams, each rendered as its tokens joined by a single
			space. Shorter n-grams come first; within one length the n-grams
			follow their order of appearance in *words*.
		"""
		# NOTE: `min` / `max` shadow the builtins, but they are part of the
		# public signature (callers may pass them by keyword), so they stay.
		# Materialise once so a one-shot iterable can be scanned for every n.
		tokens = list(words)
		phrases = []
		# `max + 1`: the upper bound is inclusive, so the defaults give 2..5 words.
		for n in range(min, max + 1):
			phrases.extend(' '.join(str(tok) for tok in gram) for gram in ngrams(tokens, n))
		return phrases

	# Tokenise each note separately. Calling str() on the list would tokenise its
	# repr (tokens such as "['power" or "crystal',") and would also create
	# n-grams that straddle two unrelated notes.
	grams = [gram for note in full_notes for gram in word_grams(note.split())]
	freq_grams = nltk.FreqDist(grams)

	# [phrase, count] pairs, most frequent first.
	dd = [[phrase, count] for phrase, count in freq_grams.most_common()]
	# Top 5 most frequent phrases
	print(dd[:5])
	# Example output:
	# [['power crystal', 6],
	# ['boom bang', 5],
	# ['final key', 5],
	# ['super big', 4],
	# ['big power', 4]]