Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
NLP - Calculating Bigram Tokens
"""NLP - Calculating Bigram Tokens.

Sentence-tokenize a short text with NLTK, word-tokenize each sentence,
flatten the nested token lists, and pair consecutive tokens into bigrams.
"""
import itertools

import nltk
from nltk.util import ngrams

# Ensure the Punkt sentence-tokenizer model is available locally.
nltk.download('punkt')

text = "today is 'Nayan's birthday. she loves ice cream. she is also fond of cream cake. we will celebrate her birthday with ice cream cake"

# Split into sentences, then word-tokenize each sentence -> list of token lists.
sentences = nltk.sent_tokenize(text)
words = [nltk.word_tokenize(sent) for sent in sentences]
print(words)

# Flatten the per-sentence token lists into one flat token sequence.
# (chain.from_iterable is the idiomatic form of chain(*words).)
flattened_list = list(itertools.chain.from_iterable(words))
# NOTE: the bare expressions from the original notebook only echoed in a
# REPL; as a script they were no-ops, so the results are printed explicitly.
print(flattened_list)
print(len(flattened_list))
# prints 28

# word_tokenize does not emit empty strings; the guard is kept as a
# harmless defensive filter.
tokens = [token for token in flattened_list if token != ""]

# Consecutive pairs: n tokens yield n-1 bigrams.
output = list(ngrams(tokens, 2))
print(output)
'''
Expected output:
[('today', 'is'),
('is', "'Nayan"),
("'Nayan", "'s"),
("'s", 'birthday'),
('birthday', '.'),
('.', 'she'),
('she', 'loves'),
('loves', 'ice'),
('ice', 'cream'),
('cream', '.'),
('.', 'she'),
('she', 'is'),
('is', 'also'),
('also', 'fond'),
('fond', 'of'),
('of', 'cream'),
('cream', 'cake'),
('cake', '.'),
('.', 'we'),
('we', 'will'),
('will', 'celebrate'),
('celebrate', 'her'),
('her', 'birthday'),
('birthday', 'with'),
('with', 'ice'),
('ice', 'cream'),
('cream', 'cake')]
'''
print(len(output))
# prints 27
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment