Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created Feb 19, 2021
Embed
What would you like to do?
def ngram(token, n):
    """Return the space-joined n-grams of a token sequence.

    Args:
        token: Sliceable sequence of string tokens (e.g. a list of words).
        n: Number of consecutive tokens in each n-gram.

    Returns:
        A list of strings, one per window of ``n`` consecutive tokens, in
        order of appearance. The list is empty when ``n`` is less than 1 or
        when ``token`` holds fewer than ``n`` items.
    """
    # An n-gram of size zero or less is not meaningful; without this guard
    # the slicing below would emit empty or arbitrary strings.
    if n < 1:
        return []
    # Each window starts at `start` and spans exactly n tokens; the range is
    # empty when there are fewer than n tokens.
    return [
        ' '.join(token[start:start + n])
        for start in range(len(token) - n + 1)
    ]
def create_feature(text, nrange=(1, 1)):
    """Count word n-gram and symbol features found in a piece of text.

    The text is lower-cased, then two kinds of features are collected:

    * word n-grams, for every n in the inclusive range ``nrange``, over the
      whitespace-separated tokens left after every character other than
      ``a-z``, ``0-9`` and ``#`` is replaced by a space;
    * unigrams over the whitespace-separated tokens left after every
      ``a-z`` / ``0-9`` character is replaced by a space (punctuation and
      other symbols; ``#`` is kept here as well).

    Args:
        text: Input string.
        nrange: ``(low, high)`` inclusive bounds on the word n-gram size.

    Returns:
        A ``collections.Counter`` mapping each feature string to the number
        of times it occurs.
    """
    # Imported locally because the visible file has no module-level imports
    # providing these names.
    import re
    from collections import Counter

    text = text.lower()

    # Tokenise once; the word list is the same for every n-gram size.
    words = re.sub(r'[^a-z0-9#]', ' ', text).split()
    symbols = re.sub(r'[a-z0-9]', ' ', text).split()

    low, high = nrange[0], nrange[1]
    features = []
    for n in range(low, high + 1):
        features += ngram(words, n)
    features += ngram(symbols, 1)
    return Counter(features)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment