marcelcaraciolo/term_frequency_normalization.py

## term_frequency_normalization.py

# Authenticate with the application's OAuth credentials.  CLIENT_ID,
# CLIENT_SECRET, CALLBACK and ACCESS_TOKEN are not defined in this part of
# the file and must be provided before this point.
auth = OAuthHandler(CLIENT_ID, CLIENT_SECRET, CALLBACK)
auth.set_access_token(ACCESS_TOKEN)
api = API(auth)


# The venue whose tips are analysed further down.
venue = api.venues(id='4bd47eeb5631c9b69672a230')
# Portuguese stop-word list from NLTK, used to filter the extracted terms.
stopwords = nltk.corpus.stopwords.words('portuguese')
# A token is a run of word characters and/or right single quotation marks
# (U+2019).  The pattern is written as an escaped unicode literal so that
# the backslash is explicit (no reliance on the unrecognised "\w" string
# escape) and the source stays pure ASCII, which also makes the quotation
# mark a single character of the class under Python 2.
tokenizer = RegexpTokenizer(u"[\\w\u2019]+", flags=re.UNICODE)


def freq(word, tokens):
    """Return how many times `word` occurs in `tokens`.

    The count is obtained from ``tokens.count(word)``.
    """
    occurrences = tokens.count(word)
    return occurrences


def word_count(tokens):
    """Return the number of items in `tokens`, i.e. ``len(tokens)``."""
    total = len(tokens)
    return total


def tf(word, tokens):
    """Return the normalized term frequency of `word` in `tokens`.

    The result is the number of occurrences of `word` in `tokens` divided by
    the total number of tokens, as a float.  An empty `tokens` yields 0.0
    instead of raising ZeroDivisionError.
    """
    total = len(tokens)
    if total == 0:
        # With no tokens at all the term cannot occur.
        return 0.0
    return tokens.count(word) / float(total)

# Compute the raw and the normalized frequency of every term in each tip.
vocabulary = []
docs = {}
all_tips = []
# Build the stop-word lookup once so each membership test is a hash lookup
# instead of a scan of the whole list.
stopword_set = set(stopwords)
for tip in venue.tips():
    tokens = tokenizer.tokenize(tip.text)

    # N-grams are taken from the raw token stream, before the unigrams are
    # lower-cased and filtered.
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)

    # Unigrams: keep tokens longer than two characters that are not stop words.
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopword_set]

    # NOTE(review): a space-joined n-gram can only equal a stop-word entry
    # that itself contains spaces, so these two filters may never remove
    # anything -- confirm what was intended.
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopword_set]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopword_set]

    final_tokens = tokens + bi_tokens + tri_tokens

    # Count every term in a single pass instead of re-scanning the whole
    # list once per token occurrence.
    counts = {}
    for token in final_tokens:
        counts[token] = counts.get(token, 0) + 1

    total = float(len(final_tokens))
    # Tips are keyed by their text, so two tips with identical text share one
    # entry (the later one wins).
    docs[tip.text] = {
        # Raw number of occurrences of each term in this tip.
        'freq': counts,
        # Term frequency: occurrences divided by the tip's total term count.
        'tf': dict((term, count / total) for term, count in counts.items()),
    }


# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(docs)

	# Authenticate with the application's OAuth credentials.  CLIENT_ID,
	# CLIENT_SECRET, CALLBACK and ACCESS_TOKEN are not defined in this part of
	# the file and must be provided before this point.
	auth = OAuthHandler(CLIENT_ID, CLIENT_SECRET, CALLBACK)
	auth.set_access_token(ACCESS_TOKEN)
	api = API(auth)


	# The venue whose tips are analysed further down.
	venue = api.venues(id='4bd47eeb5631c9b69672a230')
	# Portuguese stop-word list from NLTK, used to filter the extracted terms.
	stopwords = nltk.corpus.stopwords.words('portuguese')
	# A token is a run of word characters and/or right single quotation marks
	# (U+2019).  The pattern is written as an escaped unicode literal so that
	# the backslash is explicit (no reliance on the unrecognised "\w" string
	# escape) and the source stays pure ASCII, which also makes the quotation
	# mark a single character of the class under Python 2.
	tokenizer = RegexpTokenizer(u"[\\w\u2019]+", flags=re.UNICODE)


	def freq(word, tokens):
	    """Return how many times `word` occurs in `tokens`.

	    The count is obtained from ``tokens.count(word)``.
	    """
	    return tokens.count(word)


	def word_count(tokens):
	    """Return the number of items in `tokens`, i.e. ``len(tokens)``."""
	    return len(tokens)


	def tf(word, tokens):
	    """Return the normalized term frequency of `word` in `tokens`.

	    The result is the number of occurrences of `word` in `tokens` divided
	    by the total number of tokens, as a float.  An empty `tokens` yields
	    0.0 instead of raising ZeroDivisionError.
	    """
	    total = len(tokens)
	    if total == 0:
	        # With no tokens at all the term cannot occur.
	        return 0.0
	    return tokens.count(word) / float(total)

	# Compute the raw and the normalized frequency of every term in each tip.
	vocabulary = []
	docs = {}
	all_tips = []
	# Build the stop-word lookup once so each membership test is a hash lookup
	# instead of a scan of the whole list.
	stopword_set = set(stopwords)
	for tip in venue.tips():
	    tokens = tokenizer.tokenize(tip.text)

	    # N-grams are taken from the raw token stream, before the unigrams are
	    # lower-cased and filtered.
	    bi_tokens = bigrams(tokens)
	    tri_tokens = trigrams(tokens)

	    # Unigrams: keep tokens longer than two characters that are not stop
	    # words.
	    tokens = [token.lower() for token in tokens if len(token) > 2]
	    tokens = [token for token in tokens if token not in stopword_set]

	    # NOTE(review): a space-joined n-gram can only equal a stop-word entry
	    # that itself contains spaces, so these two filters may never remove
	    # anything -- confirm what was intended.
	    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
	    bi_tokens = [token for token in bi_tokens if token not in stopword_set]

	    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
	    tri_tokens = [token for token in tri_tokens if token not in stopword_set]

	    final_tokens = tokens + bi_tokens + tri_tokens

	    # Count every term in a single pass instead of re-scanning the whole
	    # list once per token occurrence.
	    counts = {}
	    for token in final_tokens:
	        counts[token] = counts.get(token, 0) + 1

	    total = float(len(final_tokens))
	    # Tips are keyed by their text, so two tips with identical text share
	    # one entry (the later one wins).
	    docs[tip.text] = {
	        # Raw number of occurrences of each term in this tip.
	        'freq': counts,
	        # Term frequency: occurrences divided by the tip's total term count.
	        'tf': dict((term, count / total) for term, count in counts.items()),
	    }


	# Parenthesised form prints the same on Python 2 and is valid on Python 3.
	print(docs)