Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@marcelcaraciolo
Created January 12, 2012 21:27
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save marcelcaraciolo/1603237 to your computer and use it in GitHub Desktop.
Save marcelcaraciolo/1603237 to your computer and use it in GitHub Desktop.
term frequencies
# Authenticate and fetch the venue whose tips are analysed further down.
# CLIENT_ID, CLIENT_SECRET, CALLBACK and ACCESS_TOKEN are not defined in this
# snippet and must be supplied by the surrounding code.
# NOTE(review): OAuthHandler/API look like a Foursquare client library --
# confirm against the file's imports.
auth = OAuthHandler(CLIENT_ID, CLIENT_SECRET, CALLBACK)
auth.set_access_token(ACCESS_TOKEN)
api = API(auth)
venue = api.venues(id='4bd47eeb5631c9b69672a230')

# Portuguese stopword list used to filter tokens.
stopwords = nltk.corpus.stopwords.words('portuguese')

# A token is a run of word characters and/or the typographic apostrophe
# (U+2019).  The pattern is written as a unicode literal with explicit
# escapes: a plain "[\w...]" literal relies on an invalid string escape and,
# on Python 2, would embed the apostrophe as raw bytes in a byte-string
# pattern instead of as a single character.
tokenizer = RegexpTokenizer(u"[\\w\u2019]+", flags=re.UNICODE)
def freq(word, tokens):
    """Return how many times *word* occurs in *tokens*.

    Args:
        word: The term to count.
        tokens: A list of terms (anything exposing ``count``).

    Returns:
        The number of elements of *tokens* equal to *word*; 0 when absent.
    """
    return tokens.count(word)
def word_count(tokens):
    """Return the total number of terms in *tokens*, duplicates included.

    Args:
        tokens: A sized collection of terms.

    Returns:
        ``len(tokens)``.
    """
    return len(tokens)
def tf(word, tokens):
    """Return the normalized term frequency of *word* in *tokens*.

    The value is the number of occurrences of *word* divided by the total
    number of tokens, i.e. ``freq(word, tokens) / word_count(tokens)``.

    Args:
        word: The term to measure.
        tokens: A list of terms.

    Returns:
        A float in ``[0.0, 1.0]``.  An empty *tokens* yields ``0.0`` rather
        than raising ``ZeroDivisionError``.
    """
    total = len(tokens)
    if not total:
        # No tokens at all: every term trivially has zero frequency.
        return 0.0
    # float() keeps this a true division on Python 2 as well.
    return tokens.count(word) / float(total)
# Compute the raw and normalized frequency of every term of every tip.
from collections import Counter

vocabulary = []
docs = {}  # tip text -> {'freq': {term: count}, 'tf': {term: count / total}}
all_tips = []

# Build the lookup set once so each stopword test inside the loop is O(1).
_stopword_set = set(stopwords)

for tip in venue.tips():
    words = tokenizer.tokenize(tip.text)

    # N-grams are built from the raw token stream, before short words and
    # stopwords are dropped, so they keep the original word adjacency.
    bi_tokens = [' '.join(gram).lower() for gram in bigrams(words)]
    tri_tokens = [' '.join(gram).lower() for gram in trigrams(words)]

    # Unigrams: drop words of one or two characters, lowercase, then remove
    # stopwords.
    tokens = [word.lower() for word in words if len(word) > 2]
    tokens = [token for token in tokens if token not in _stopword_set]

    # NOTE(review): the joined n-grams are filtered against the same list;
    # presumably a no-op if the list only holds single words -- confirm.
    bi_tokens = [token for token in bi_tokens if token not in _stopword_set]
    tri_tokens = [token for token in tri_tokens if token not in _stopword_set]

    final_tokens = tokens + bi_tokens + tri_tokens

    # One counting pass replaces a full list scan per token; the values are
    # the same ones freq() and tf() return for final_tokens.
    counts = Counter(final_tokens)
    total = float(len(final_tokens))
    docs[tip.text] = {
        # The frequency computed for each term of the tip.
        'freq': dict(counts),
        # The term frequency (normalized frequency).
        'tf': dict((token, count / total) for token, count in counts.items()),
    }

# Parenthesised form prints the same thing on Python 2 and Python 3.
print(docs)
@sunnyglzg
Copy link

Where can I find the tutorial where you explain how to use this piece of code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment