Skip to content

Instantly share code, notes, and snippets.

@YiLi225
Created May 5, 2022 20:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save YiLi225/9fa6d4617f872aa067b410459a95eac5 to your computer and use it in GitHub Desktop.
Save YiLi225/9fa6d4617f872aa067b410459a95eac5 to your computer and use it in GitHub Desktop.
import re

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from wordcloud import WordCloud, ImageColorGenerator
def replace(match):
    """Return the replacement text for a regex match, for use as ``re.sub``'s repl.

    Args:
        match: Match object whose full matched text is a key of the
            module-level ``swMapping`` dict.

    Returns:
        The value ``swMapping`` stores for the matched text.

    Raises:
        KeyError: If the matched text is not a key of ``swMapping``.
    """
    return swMapping[match.group(0)]
## Define stopwords
curSW = stopwords.words('english')
curSW += ['unk']
## Every stopword maps to the empty string, so substituting it deletes the word
swMapping = dict.fromkeys(curSW, '')
## Compile the whole-word stopword alternation once, not once per note
sw_pattern = re.compile('|'.join(r'\b%s\b' % re.escape(s) for s in swMapping))
## Raw text data
full_notes = doc_set[8319:8334]
## Remove all numbers and $ signs/punctuations:
full_notes = [''.join(ch for ch in note if not ch.isdigit()) for note in full_notes]
full_notes = [re.sub(r'\W+', ' ', note) for note in full_notes]  ## remove all special chars
full_notes = [sw_pattern.sub(replace, note).strip() for note in full_notes]
## Collapse the runs of spaces left behind by the removed stopwords
full_notes = [' '.join(note.split()) for note in full_notes]
## Build an ngrams model
def word_grams(words, min=2, max=5):
    """Return every n-gram of ``words`` as a space-joined string.

    Args:
        words: Iterable of tokens; each token is passed through ``str``.
        min: Smallest n-gram length to generate.
        max: Largest n-gram length to generate (inclusive).

    Returns:
        A list of phrases, grouped by increasing n and, within each n,
        in order of appearance.
    """
    # Materialize once so an iterator input is not used up by the first n.
    words = list(words)
    s = []
    # max + 1: range() excludes its stop value, which silently dropped the
    # longest requested n-gram length.
    for n in range(min, max + 1):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s
## Tokenize each note on its own. Splitting str(full_notes) would split the
## list's repr, leaving brackets, quotes and commas glued to the tokens and
## producing phrases that run from the end of one note into the next.
grams = [gram for note in full_notes for gram in word_grams(note.split())]
freq_grams = nltk.FreqDist(grams)
## [phrase, count] pairs, most frequent first
dd = sorted(([k, v] for k, v in freq_grams.items()), key=lambda x: x[1], reverse=True)
# Top 5 phrases with 2-5 words
print(dd[:5])
# [['power crystal', 6],
# ['boom bang', 5],
# ['final key', 5],
# ['super big', 4],
# ['big power', 4]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment