Skip to content

Instantly share code, notes, and snippets.

@rjurney
Created February 2, 2013 07:57
Show Gist options
  • Save rjurney/4696497 to your computer and use it in GitHub Desktop.
Save rjurney/4696497 to your computer and use it in GitHub Desktop.
Implements NTF-IDF (TF-IDF with maximum-TF normalization) as a Pig macro. Shout-out to Mat Kelcey, who recommended this approach. See: http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
/*
 * tf_idf: score every (document, token) pair with normalized TF-IDF
 * (maximum-TF normalization, smoothing term 0.4).
 *
 * Parameters:
 *   token_records - input relation; every record counts as one occurrence
 *                   of its token within its document
 *   id_field      - name of the document-id field in token_records
 *   token_field   - name of the token field in token_records
 *
 * Returns:
 *   out_relation  - (id_field, token, score:double), one row per distinct
 *                   (document, token) pair. The id and token are declared
 *                   as chararray when the per-token groups are flattened.
 *
 * Scoring:
 *   tf    = occurrences of token in doc / total token occurrences in doc
 *   ntf   = 0.4 + (1 - 0.4) * tf / (largest tf in the same doc)
 *   idf   = LOG(number of distinct docs / number of docs containing token)
 *   score = ntf * idf
 */
DEFINE tf_idf(token_records, id_field, token_field) RETURNS out_relation {
    /* Count how often each token occurs in each document */
    doc_word_totals = foreach (group $token_records by ($id_field, $token_field)) generate
        FLATTEN(group) as ($id_field, token),
        COUNT_STAR($token_records) as doc_total;

    /* Attach the document size (sum of all token counts) to every token row */
    pre_term_counts = foreach (group doc_word_totals by $id_field) generate
        group AS $id_field,
        FLATTEN(doc_word_totals.(token, doc_total)) as (token, doc_total),
        SUM(doc_word_totals.doc_total) as doc_size;

    /* Raw term frequency */
    term_freqs = foreach pre_term_counts generate
        $id_field as $id_field,
        token as token,
        ((double)doc_total / (double)doc_size) AS term_freq;

    /* Largest term frequency per document. MAX over the grouped bag yields
       the same value as sorting descending and keeping the first row. */
    max_freqs = foreach (group term_freqs by $id_field) generate
        group as $id_field,
        MAX(term_freqs.term_freq) as doc_max;

    /* Normalized term frequency. Fields coming out of the join are
       disambiguated with '::' - a dot-projection on the relation alias
       would not refer to the joined row's column. */
    joined_freqs = join max_freqs by $id_field, term_freqs by $id_field;
    n_term_freqs = foreach joined_freqs generate
        term_freqs::$id_field as $id_field,
        term_freqs::token as token,
        (0.4 + (1.0 - 0.4) * (term_freqs::term_freq / max_freqs::doc_max)) as n_term_freq:double;

    /* Number of documents that use each token, for the idf denominator */
    token_usages = foreach (group n_term_freqs by token) generate
        FLATTEN(n_term_freqs) as ($id_field:chararray, token:chararray, n_term_freq:double),
        COUNT_STAR(n_term_freqs) as num_docs_with_token;

    /* Total number of distinct documents */
    just_ids = foreach $token_records generate $id_field;
    distinct_ids = DISTINCT just_ids;
    ndocs = foreach (group distinct_ids all) generate COUNT_STAR(distinct_ids) as total_docs;

    /* ndocs has exactly one row, so ndocs.total_docs is read as a Pig scalar */
    $out_relation = foreach token_usages {
        idf = LOG((double)ndocs.total_docs / (double)num_docs_with_token);
        tf_idf = n_term_freq * idf;
        generate $id_field as $id_field,
                 token as token,
                 tf_idf as score:double;
    };
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment