Skip to content

Instantly share code, notes, and snippets.

@datancoffee
Created July 9, 2017 01:17
Show Gist options
  • Save datancoffee/86ad7176e31a0fa424341dd35b1befa5 to your computer and use it in GitHub Desktop.
Save datancoffee/86ad7176e31a0fa424341dd35b1befa5 to your computer and use it in GitHub Desktop.
Opinion Analysis query after
INSERT INTO opinions.stattopic (SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs,
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes )
WITH
p AS (
SELECT 20170630 AS SnapshotDateId
),
SentimentTags AS (
SELECT p.SnapshotDateId, s.SentimentHash, t.Tag, t.GoodAsTopic, s.Tags AS Tags
FROM p, opinions.sentiment s, UNNEST(s.Tags) AS t
WHERE
s.DocumentDateId = p.SnapshotDateId AND s.SentimentTotalScore > 0
),
SentimentTagCombos AS (
SELECT st.SnapshotDateId, st.SentimentHash, st.Tag AS Tag1, stt.Tag AS Tag2
FROM SentimentTags st, UNNEST(st.Tags) stt
WHERE st.Tag < stt.Tag
),
CalcStatSentiments AS (
SELECT st.SnapshotDateId, st.Tag, st.GoodAsTopic, d.DocumentHash AS DocumentHash, s.SentimentHash,
wrOrig.WebResourceHash AS OrigWebResourceHash, wrOrig.Domain AS OrigDomain, wrRepost.WebResourceHash AS RepostWebResourceHash,
s.DominantValence AS Valence, d.PublicationTime AS PublicationTime
FROM SentimentTags st
INNER JOIN opinions.sentiment s ON s.SentimentHash = st.SentimentHash AND s.DocumentDateId = st.SnapshotDateId
INNER JOIN opinions.document d ON d.DocumentHash = s.DocumentHash AND d.PublicationDateId = st.SnapshotDateId
INNER JOIN opinions.webresource wrOrig ON wrOrig.DocumentHash = d.DocumentHash
INNER JOIN opinions.webresource wrRepost ON wrRepost.DocumentCollectionId = d.DocumentCollectionId
AND wrRepost.CollectionItemId = d.CollectionItemId
),
CalcStatTopics AS (
SELECT
c.SnapshotDateId, c.Tag AS Topic, [c.Tag] AS Tags, c.GoodAsTopic, 1 AS TagCount,
COUNT(distinct OrigDomain) as cntOrigPublishers,
COUNT(distinct RepostWebResourceHash) as cntRepostWRs,
COUNT(distinct (case when c.Valence=1 then c.SentimentHash else null end)) as cntPositives,
COUNT(distinct (case when c.Valence=2 then c.SentimentHash else null end)) as cntNegatives,
COUNT(distinct (case when c.Valence=3 then c.SentimentHash else null end)) as cntAmbiguous,
COUNT(distinct (case when c.Valence=5 then c.SentimentHash else null end)) as cntGeneral,
ARRAY_AGG(DISTINCT c.SentimentHash) AS SentimentHashes,
ARRAY_AGG(DISTINCT c.OrigWebResourceHash) AS OrigWebResourceHashes,
ARRAY_AGG(DISTINCT c.RepostWebResourceHash) AS RepostWebResourceHashes
FROM CalcStatSentiments c
GROUP BY c.SnapshotDateId, c.Tag, c.GoodAsTopic
),
CalcStatCombiTopics AS (
SELECT
stc.SnapshotDateId, CONCAT(stc.Tag1,' & ',stc.Tag2) AS Topic, [stc.Tag1,stc.Tag2] AS Tags, true AS GoodAsTopic, 2 AS TagCount,
COUNT(distinct wrOrig.Domain) as cntOrigPublishers,
COUNT(distinct wrRepost.WebResourceHash) as cntRepostWRs,
COUNT(distinct (case when s.DominantValence=1 then s.SentimentHash else null end)) as cntPositives,
COUNT(distinct (case when s.DominantValence=2 then s.SentimentHash else null end)) as cntNegatives,
COUNT(distinct (case when s.DominantValence=3 then s.SentimentHash else null end)) as cntAmbiguous,
COUNT(distinct (case when s.DominantValence=5 then s.SentimentHash else null end)) as cntGeneral,
ARRAY_AGG(DISTINCT s.SentimentHash) AS SentimentHashes,
ARRAY_AGG(DISTINCT wrOrig.WebResourceHash) AS OrigWebResourceHashes,
ARRAY_AGG(DISTINCT wrRepost.WebResourceHash) AS RepostWebResourceHashes
FROM SentimentTagCombos stc
INNER JOIN opinions.sentiment s ON s.SentimentHash = stc.SentimentHash AND s.DocumentDateId = stc.SnapshotDateId
INNER JOIN opinions.document d ON d.DocumentHash = s.DocumentHash AND d.PublicationDateId = stc.SnapshotDateId
INNER JOIN opinions.webresource wrOrig ON wrOrig.DocumentHash = d.DocumentHash
INNER JOIN opinions.webresource wrRepost ON wrRepost.DocumentCollectionId = d.DocumentCollectionId
AND wrRepost.CollectionItemId = d.CollectionItemId
GROUP BY stc.SnapshotDateId, stc.Tag1, stc.Tag2
-- HAVING cntPublisherDomains > 1
),
CalcStatAllTopics AS (
SELECT * FROM CalcStatTopics
WHERE GoodAsTopic = true AND cntRepostWRs > 1
UNION ALL
SELECT * FROM CalcStatCombiTopics
WHERE GoodAsTopic = true AND cntRepostWRs > 1
)
SELECT SnapshotDateId, Topic, Tags, TagCount, cntOrigPublishers, cntRepostWRs,
cntPositives, cntNegatives, cntAmbiguous, cntGeneral, SentimentHashes, OrigWebResourceHashes, RepostWebResourceHashes
FROM CalcStatAllTopics
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment