Term frequencies: count unigram, bigram, and trigram occurrences in the tips of a Foursquare venue (Portuguese stopwords removed).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Authenticate against the Foursquare API and fetch a single venue by id.
auth = OAuthHandler(CLIENT_ID, CLIENT_SECRET, CALLBACK)
auth.set_access_token(ACCESS_TOKEN)
api = API(auth)
venue = api.venues(id='4bd47eeb5631c9b69672a230')

# Portuguese stopword list for filtering tokens below.
stopwords = nltk.corpus.stopwords.words('portuguese')
# Raw string: "\w" inside a non-raw literal is an invalid escape sequence
# (SyntaxWarning in modern Python). The pattern keeps the curly apostrophe
# inside word tokens; re.UNICODE makes \w match accented characters.
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)
def freq(word, tokens):
    """Return the number of occurrences of *word* in the list *tokens*."""
    return sum(1 for tok in tokens if tok == word)
# Compute per-tip term frequencies over unigrams, bigrams, and trigrams.
vocabulary = []
docs = {}       # tip text -> {'freq': {term: count}}
all_tips = []
for tip in venue.tips():
    tokens = tokenizer.tokenize(tip.text)
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)

    # Lowercase unigrams longer than two characters, then drop stopwords.
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]

    # Join n-gram tuples into lowercase space-separated strings.
    # NOTE(review): filtering the joined bigram/trigram strings against a
    # unigram stopword list will almost never match — kept as-is for
    # behavior parity; confirm whether per-word filtering was intended.
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]
    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)

    # Count every term in one O(n) pass. The original called
    # final_tokens.count() for each token, which is O(n^2) per tip.
    term_counts = {}
    for token in final_tokens:
        term_counts[token] = term_counts.get(token, 0) + 1
    docs[tip.text] = {'freq': term_counts}

# print(x) with a single argument behaves identically in Python 2 and 3,
# unlike the original Python-2-only "print docs" statement.
print(docs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment