Skip to content

Instantly share code, notes, and snippets.

@PiotrJander
Created December 19, 2016 18:14
Show Gist options
  • Save PiotrJander/5846fcdef9251f70738ba0d7422e6a92 to your computer and use it in GitHub Desktop.
Save PiotrJander/5846fcdef9251f70738ba0d7422e6a92 to your computer and use it in GitHub Desktop.
-- Ranked search results for a single query word.
--
-- Steps:
--   1. top_hits  - the 100 index rows for the word with the highest ranking1()
--                  score, together with their tf, idf and tfidf components.
--   2. hit_stats - exactly one row holding the mean and population standard
--                  deviation of page_rank and tfidf over those same 100 rows.
--   3. final select - every hit paired with the single hit_stats row, so that
--                  ranking() and normalized() receive each value alongside its
--                  mean and deviation. Results are sorted by ranking, best first.
--
-- ranking, ranking1, normalized, tfidf, tf, idf and term_count are not defined in
-- this script. Presumably they are macros or UDFs registered elsewhere - TODO confirm.
--
-- NOTE(review): the order by in top_hits has no tie-breaker, so rows with equal
-- ranking1() scores at the 100-row cutoff are chosen arbitrarily.
with top_hits as (
    select
        word,
        -- doc is passed through java.net.URLDecoder.decode to produce url
        reflect("java.net.URLDecoder", "decode", doc) as url,
        tfidf(normal, title, document_url, heading, stop_word, out_link, in_link, host, max_count, doc_count) as tfidf,
        page_rank,
        tf(normal, title, document_url, heading, stop_word, out_link, in_link, host, max_count) as tf,
        idf(doc_count) as idf,
        term_count(normal, title, document_url, heading, stop_word, out_link, in_link, host) as term_count,
        max_count,
        doc_count,
        normal,
        title,
        document_url,
        heading,
        stop_word,
        out_link,
        in_link,
        host
    from hdfs_index_denormalized_doc_count
    where word = "gandhi"
    order by ranking1(normal, title, document_url, heading, stop_word, out_link, in_link, host, max_count, doc_count) desc
    limit 100
),
hit_stats as (
    -- Aggregates without group by: always a single row.
    select
        avg(page_rank) as prm,
        stddev_pop(page_rank) as prd,
        avg(tfidf) as tfidfm,
        stddev_pop(tfidf) as tfidfd
    from top_hits
)
select
    word,
    url,
    ranking(page_rank, prm, prd, tfidf, tfidfm, tfidfd) as ranking,
    normalized(page_rank, prm, prd) as page_rank_norm,
    normalized(tfidf, tfidfm, tfidfd) as tfidf_norm,
    page_rank,
    prm,
    prd,
    tfidf,
    tfidfm,
    tfidfd,
    tf,
    idf,
    term_count,
    max_count,
    doc_count,
    normal,
    title,
    document_url,
    heading,
    stop_word,
    out_link,
    in_link,
    host
from top_hits
-- Intentional Cartesian product: hit_stats has one row, so each hit is kept once.
cross join hit_stats
order by ranking desc;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment