Created
July 9, 2017 01:15
-
-
Save datancoffee/bda8fff71a435bc1de1a7793ee94f9d0 to your computer and use it in GitHub Desktop.
Opinion Analysis query before
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSERT INTO opinions.stattopic (SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs, | |
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes ) | |
WITH | |
p AS ( | |
SELECT 20170630 AS SnapshotDateId | |
), | |
CalcStatSentiments AS ( | |
SELECT p.SnapshotDateId, t.Tag, t.GoodAsTopic, d.DocumentHash AS DocumentHash, s.SentimentHash, | |
wrOrig.WebResourceHash AS OrigWebResourceHash, wrOrig.Domain AS OrigDomain, wrRepost.WebResourceHash AS RepostWebResourceHash, | |
s.DominantValence AS Valence, d.PublicationTime AS PublicationTime | |
FROM opinions.document d, p | |
INNER JOIN opinions.sentiment s ON s.DocumentHash = d.DocumentHash, UNNEST(s.Tags) AS t | |
-- Need to use Sentiment tags, so that sentiments relate to topics | |
INNER JOIN opinions.webresource wrOrig ON wrOrig.DocumentHash = d.DocumentHash | |
INNER JOIN opinions.webresource wrRepost ON wrRepost.DocumentCollectionId = d.DocumentCollectionId | |
AND wrRepost.CollectionItemId = d.CollectionItemId | |
WHERE | |
d.PublicationDateId = p.SnapshotDateId AND s.SentimentTotalScore > 0 | |
), | |
CalcStatTopics AS ( | |
SELECT | |
c.SnapshotDateId, c.Tag AS Topic, [c.Tag] AS Tags, c.GoodAsTopic, 1 AS TagCount, | |
COUNT(distinct OrigDomain) as cntOrigPublishers, | |
COUNT(distinct RepostWebResourceHash) as cntRepostWRs, | |
COUNT(distinct (case when c.Valence=1 then c.SentimentHash else null end)) as cntPositives, | |
COUNT(distinct (case when c.Valence=2 then c.SentimentHash else null end)) as cntNegatives, | |
COUNT(distinct (case when c.Valence=3 then c.SentimentHash else null end)) as cntAmbiguous, | |
COUNT(distinct (case when c.Valence=5 then c.SentimentHash else null end)) as cntGeneral, | |
ARRAY_AGG(DISTINCT c.SentimentHash) AS SentimentHashes, | |
ARRAY_AGG(DISTINCT c.OrigWebResourceHash) AS OrigWebResourceHashes, | |
ARRAY_AGG(DISTINCT c.RepostWebResourceHash) AS RepostWebResourceHashes | |
FROM CalcStatSentiments c | |
GROUP BY c.SnapshotDateId, c.Tag, c.GoodAsTopic | |
), | |
CalcStatCombiTopics AS ( | |
SELECT | |
css1.SnapshotDateId, CONCAT(css1.Tag,' & ',css2.Tag) AS Topic, [css1.Tag,css2.Tag] AS Tags, true AS GoodAsTopic, 2 AS TagCount, | |
COUNT(distinct css1.OrigDomain) as cntOrigPublishers, | |
COUNT(distinct css1.RepostWebResourceHash) as cntRepostWRs, | |
COUNT(distinct (case when css1.Valence=1 then css1.SentimentHash else null end)) as cntPositives, | |
COUNT(distinct (case when css1.Valence=2 then css1.SentimentHash else null end)) as cntNegatives, | |
COUNT(distinct (case when css1.Valence=3 then css1.SentimentHash else null end)) as cntAmbiguous, | |
COUNT(distinct (case when css1.Valence=5 then css1.SentimentHash else null end)) as cntGeneral, | |
ARRAY_AGG(DISTINCT css1.SentimentHash) AS SentimentHashes, | |
ARRAY_AGG(DISTINCT css1.OrigWebResourceHash) AS OrigWebResourceHashes, | |
ARRAY_AGG(DISTINCT css1.RepostWebResourceHash) AS RepostWebResourceHashes | |
FROM | |
CalcStatSentiments css1, CalcStatSentiments css2 | |
WHERE | |
css1.SentimentHash = css2.SentimentHash AND | |
css1.Tag < css2.Tag | |
GROUP BY css1.SnapshotDateId, css1.Tag, css2.Tag | |
-- HAVING cntPublisherDomains > 1 | |
), | |
CalcStatAllTopics AS ( | |
SELECT * FROM CalcStatTopics | |
WHERE GoodAsTopic = true AND cntRepostWRs > 1 | |
UNION ALL | |
SELECT * FROM CalcStatCombiTopics | |
WHERE GoodAsTopic = true AND cntRepostWRs > 1 | |
) | |
SELECT SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs, | |
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes | |
FROM CalcStatAllTopics |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment