Skip to content

Instantly share code, notes, and snippets.

@datancoffee
Created July 9, 2017 01:15
Show Gist options
  • Save datancoffee/bda8fff71a435bc1de1a7793ee94f9d0 to your computer and use it in GitHub Desktop.
Save datancoffee/bda8fff71a435bc1de1a7793ee94f9d0 to your computer and use it in GitHub Desktop.
Opinion Analysis query before
INSERT INTO opinions.stattopic (SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs,
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes )
WITH
p AS (
SELECT 20170630 AS SnapshotDateId
),
CalcStatSentiments AS (
SELECT p.SnapshotDateId, t.Tag, t.GoodAsTopic, d.DocumentHash AS DocumentHash, s.SentimentHash,
wrOrig.WebResourceHash AS OrigWebResourceHash, wrOrig.Domain AS OrigDomain, wrRepost.WebResourceHash AS RepostWebResourceHash,
s.DominantValence AS Valence, d.PublicationTime AS PublicationTime
FROM opinions.document d, p
INNER JOIN opinions.sentiment s ON s.DocumentHash = d.DocumentHash, UNNEST(s.Tags) AS t
-- Need to use Sentiment tags, so that sentiments relate to topics
INNER JOIN opinions.webresource wrOrig ON wrOrig.DocumentHash = d.DocumentHash
INNER JOIN opinions.webresource wrRepost ON wrRepost.DocumentCollectionId = d.DocumentCollectionId
AND wrRepost.CollectionItemId = d.CollectionItemId
WHERE
d.PublicationDateId = p.SnapshotDateId AND s.SentimentTotalScore > 0
),
CalcStatTopics AS (
SELECT
c.SnapshotDateId, c.Tag AS Topic, [c.Tag] AS Tags, c.GoodAsTopic, 1 AS TagCount,
COUNT(distinct OrigDomain) as cntOrigPublishers,
COUNT(distinct RepostWebResourceHash) as cntRepostWRs,
COUNT(distinct (case when c.Valence=1 then c.SentimentHash else null end)) as cntPositives,
COUNT(distinct (case when c.Valence=2 then c.SentimentHash else null end)) as cntNegatives,
COUNT(distinct (case when c.Valence=3 then c.SentimentHash else null end)) as cntAmbiguous,
COUNT(distinct (case when c.Valence=5 then c.SentimentHash else null end)) as cntGeneral,
ARRAY_AGG(DISTINCT c.SentimentHash) AS SentimentHashes,
ARRAY_AGG(DISTINCT c.OrigWebResourceHash) AS OrigWebResourceHashes,
ARRAY_AGG(DISTINCT c.RepostWebResourceHash) AS RepostWebResourceHashes
FROM CalcStatSentiments c
GROUP BY c.SnapshotDateId, c.Tag, c.GoodAsTopic
),
CalcStatCombiTopics AS (
SELECT
css1.SnapshotDateId, CONCAT(css1.Tag,' & ',css2.Tag) AS Topic, [css1.Tag,css2.Tag] AS Tags, true AS GoodAsTopic, 2 AS TagCount,
COUNT(distinct css1.OrigDomain) as cntOrigPublishers,
COUNT(distinct css1.RepostWebResourceHash) as cntRepostWRs,
COUNT(distinct (case when css1.Valence=1 then css1.SentimentHash else null end)) as cntPositives,
COUNT(distinct (case when css1.Valence=2 then css1.SentimentHash else null end)) as cntNegatives,
COUNT(distinct (case when css1.Valence=3 then css1.SentimentHash else null end)) as cntAmbiguous,
COUNT(distinct (case when css1.Valence=5 then css1.SentimentHash else null end)) as cntGeneral,
ARRAY_AGG(DISTINCT css1.SentimentHash) AS SentimentHashes,
ARRAY_AGG(DISTINCT css1.OrigWebResourceHash) AS OrigWebResourceHashes,
ARRAY_AGG(DISTINCT css1.RepostWebResourceHash) AS RepostWebResourceHashes
FROM
CalcStatSentiments css1, CalcStatSentiments css2
WHERE
css1.SentimentHash = css2.SentimentHash AND
css1.Tag < css2.Tag
GROUP BY css1.SnapshotDateId, css1.Tag, css2.Tag
-- HAVING cntPublisherDomains > 1
),
CalcStatAllTopics AS (
SELECT * FROM CalcStatTopics
WHERE GoodAsTopic = true AND cntRepostWRs > 1
UNION ALL
SELECT * FROM CalcStatCombiTopics
WHERE GoodAsTopic = true AND cntRepostWRs > 1
)
SELECT SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs,
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes
FROM CalcStatAllTopics
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment