Last active
May 29, 2016 11:52
-
-
Save trylks/733172ee3c63082935235f85e5dbbbdb to your computer and use it in GitHub Desktop.
needing to ask: http://stackoverflow.com/a/37507447/821780
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compute the perplexity of a sample sentence under a trigram language model
# trained on one user's tweets, using Lidstone smoothing.
import nltk
from nltk.probability import LidstoneProbDist
# NOTE(review): `nltk.model.ngram` does not appear to ship with NLTK 3.x
# releases -- confirm the installed NLTK version provides `NgramModel`.
from nltk.model.ngram import NgramModel
import pandas as pd


def tokenize(x):
    """Return the word tokens of *x* after coercing it to ``str``."""
    # str() guards against non-string cells (pandas may yield e.g. NaN floats).
    return nltk.word_tokenize(str(x))


def estimator(fdist, bins):
    """Return a Lidstone-smoothed probability distribution over *fdist*.

    The smoothing parameter (gamma) is fixed at 0.2. *bins* is accepted so
    the callable can be invoked with two arguments, but it is not forwarded
    to ``LidstoneProbDist``.
    """
    return LidstoneProbDist(fdist, 0.2)


tweets = pd.read_csv('tweeets.csv')

# Training corpus: one token list per tweet written by user 'trylks'.
user_texts = tweets.loc[tweets.user == 'trylks', 'text']
train = [tokenize(tweet_text) for tweet_text in user_texts]

# Held-out sentence whose perplexity is measured.
text = tokenize("I think that the #Python library #nltk is great")

# here is the good part
model = NgramModel(3, train, estimator=estimator)  # 3 => trigram model
perplexity = model.perplexity(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment