Created
December 19, 2016 18:14
-
-
Save PiotrJander/5846fcdef9251f70738ba0d7422e6a92 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Ranks the best index hits for a single search word (HiveQL, uses the
-- reflect() UDF and needs CTE support).
--
-- Steps:
--   1. top_hits  - the 100 rows for the word with the highest ranking1() score,
--                  together with their tf / idf / tfidf / term_count values.
--   2. hit_stats - one row holding the mean and population standard deviation
--                  of page_rank and tfidf over exactly those 100 rows.
--   3. final     - every hit paired with the single stats row, then ordered by
--                  ranking(), which receives both raw scores and their stats.
--
-- tfidf(), tf(), idf(), term_count(), ranking1(), ranking() and normalized()
-- are not defined in this script. They must already be registered in the
-- session (presumably as temporary macros or UDFs - TODO confirm).
--
-- The candidate set is defined once and read twice, so the search word and the
-- pre-ranking expression cannot drift apart between the rows being ranked and
-- the rows the statistics are computed from.
with top_hits as (
    select
        word,
        -- doc is passed through java.net.URLDecoder.decode to obtain the url
        reflect("java.net.URLDecoder", "decode", doc) as url,
        tfidf(normal, title, document_url, heading, stop_word, out_link, in_link, host, max_count, doc_count) as tfidf,
        page_rank,
        tf(normal, title, document_url, heading, stop_word, out_link, in_link, host, max_count) as tf,
        idf(doc_count) as idf,
        term_count(normal, title, document_url, heading, stop_word, out_link, in_link, host) as term_count,
        max_count,
        doc_count,
        normal,
        title,
        document_url,
        heading,
        stop_word,
        out_link,
        in_link,
        host
    from hdfs_index_denormalized_doc_count
    where word = "gandhi"
    -- pre-rank with ranking1() and keep only the best 100 candidates
    order by ranking1(normal, title, document_url, heading, stop_word, out_link, in_link, host, max_count, doc_count) desc
    limit 100
),

hit_stats as (
    -- aggregate without group by: always exactly one row
    select
        avg(page_rank) as prm,
        stddev_pop(page_rank) as prd,
        avg(tfidf) as tfidfm,
        stddev_pop(tfidf) as tfidfd
    from top_hits
)

select
    word,
    url,
    ranking(page_rank, prm, prd, tfidf, tfidfm, tfidfd) as ranking,
    normalized(page_rank, prm, prd) as page_rank_norm,
    normalized(tfidf, tfidfm, tfidfd) as tfidf_norm,
    page_rank,
    prm,
    prd,
    tfidf,
    tfidfm,
    tfidfd,
    tf,
    idf,
    term_count,
    max_count,
    doc_count,
    normal,
    title,
    document_url,
    heading,
    stop_word,
    out_link,
    in_link,
    host
from top_hits
-- intentional cartesian product: hit_stats has a single row, so each hit is
-- simply extended with the four statistics columns
cross join hit_stats
order by ranking desc;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment