Created
July 9, 2017 01:17
-
-
Save datancoffee/86ad7176e31a0fa424341dd35b1befa5 to your computer and use it in GitHub Desktop.
Opinion Analysis query after
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSERT INTO opinions.stattopic (SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs, | |
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes ) | |
WITH | |
p AS ( | |
SELECT 20170630 AS SnapshotDateId | |
), | |
SentimentTags AS ( | |
SELECT p.SnapshotDateId, s.SentimentHash, t.Tag, t.GoodAsTopic, s.Tags AS Tags | |
FROM p, opinions.sentiment s, UNNEST(s.Tags) AS t | |
WHERE | |
s.DocumentDateId = p.SnapshotDateId AND s.SentimentTotalScore > 0 | |
), | |
SentimentTagCombos AS ( | |
SELECT st.SnapshotDateId, st.SentimentHash, st.Tag AS Tag1, stt.Tag AS Tag2 | |
FROM SentimentTags st, UNNEST(st.Tags) stt | |
WHERE st.Tag < stt.Tag | |
), | |
CalcStatSentiments AS ( | |
SELECT st.SnapshotDateId, st.Tag, st.GoodAsTopic, d.DocumentHash AS DocumentHash, s.SentimentHash, | |
wrOrig.WebResourceHash AS OrigWebResourceHash, wrOrig.Domain AS OrigDomain, wrRepost.WebResourceHash AS RepostWebResourceHash, | |
s.DominantValence AS Valence, d.PublicationTime AS PublicationTime | |
FROM SentimentTags st | |
INNER JOIN opinions.sentiment s ON s.SentimentHash = st.SentimentHash AND s.DocumentDateId = st.SnapshotDateId | |
INNER JOIN opinions.document d ON d.DocumentHash = s.DocumentHash AND d.PublicationDateId = st.SnapshotDateId | |
INNER JOIN opinions.webresource wrOrig ON wrOrig.DocumentHash = d.DocumentHash | |
INNER JOIN opinions.webresource wrRepost ON wrRepost.DocumentCollectionId = d.DocumentCollectionId | |
AND wrRepost.CollectionItemId = d.CollectionItemId | |
), | |
CalcStatTopics AS ( | |
SELECT | |
c.SnapshotDateId, c.Tag AS Topic, [c.Tag] AS Tags, c.GoodAsTopic, 1 AS TagCount, | |
COUNT(distinct OrigDomain) as cntOrigPublishers, | |
COUNT(distinct RepostWebResourceHash) as cntRepostWRs, | |
COUNT(distinct (case when c.Valence=1 then c.SentimentHash else null end)) as cntPositives, | |
COUNT(distinct (case when c.Valence=2 then c.SentimentHash else null end)) as cntNegatives, | |
COUNT(distinct (case when c.Valence=3 then c.SentimentHash else null end)) as cntAmbiguous, | |
COUNT(distinct (case when c.Valence=5 then c.SentimentHash else null end)) as cntGeneral, | |
ARRAY_AGG(DISTINCT c.SentimentHash) AS SentimentHashes, | |
ARRAY_AGG(DISTINCT c.OrigWebResourceHash) AS OrigWebResourceHashes, | |
ARRAY_AGG(DISTINCT c.RepostWebResourceHash) AS RepostWebResourceHashes | |
FROM CalcStatSentiments c | |
GROUP BY c.SnapshotDateId, c.Tag, c.GoodAsTopic | |
), | |
CalcStatCombiTopics AS ( | |
SELECT | |
stc.SnapshotDateId, CONCAT(stc.Tag1,' & ',stc.Tag2) AS Topic, [stc.Tag1,stc.Tag2] AS Tags, true AS GoodAsTopic, 2 AS TagCount, | |
COUNT(distinct wrOrig.Domain) as cntOrigPublishers, | |
COUNT(distinct wrRepost.WebResourceHash) as cntRepostWRs, | |
COUNT(distinct (case when s.DominantValence=1 then s.SentimentHash else null end)) as cntPositives, | |
COUNT(distinct (case when s.DominantValence=2 then s.SentimentHash else null end)) as cntNegatives, | |
COUNT(distinct (case when s.DominantValence=3 then s.SentimentHash else null end)) as cntAmbiguous, | |
COUNT(distinct (case when s.DominantValence=5 then s.SentimentHash else null end)) as cntGeneral, | |
ARRAY_AGG(DISTINCT s.SentimentHash) AS SentimentHashes, | |
ARRAY_AGG(DISTINCT wrOrig.WebResourceHash) AS OrigWebResourceHashes, | |
ARRAY_AGG(DISTINCT wrRepost.WebResourceHash) AS RepostWebResourceHashes | |
FROM SentimentTagCombos stc | |
INNER JOIN opinions.sentiment s ON s.SentimentHash = stc.SentimentHash AND s.DocumentDateId = stc.SnapshotDateId | |
INNER JOIN opinions.document d ON d.DocumentHash = s.DocumentHash AND d.PublicationDateId = stc.SnapshotDateId | |
INNER JOIN opinions.webresource wrOrig ON wrOrig.DocumentHash = d.DocumentHash | |
INNER JOIN opinions.webresource wrRepost ON wrRepost.DocumentCollectionId = d.DocumentCollectionId | |
AND wrRepost.CollectionItemId = d.CollectionItemId | |
GROUP BY stc.SnapshotDateId, stc.Tag1, stc.Tag2 | |
-- HAVING cntPublisherDomains > 1 | |
), | |
CalcStatAllTopics AS ( | |
SELECT * FROM CalcStatTopics | |
WHERE GoodAsTopic = true AND cntRepostWRs > 1 | |
UNION ALL | |
SELECT * FROM CalcStatCombiTopics | |
WHERE GoodAsTopic = true AND cntRepostWRs > 1 | |
) | |
SELECT SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs, | |
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes | |
FROM CalcStatAllTopics |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment