Created
August 4, 2023 22:05
-
-
Save maxpollard/fafdbd6460b125d7582b5f595ef97d59 to your computer and use it in GitHub Desktop.
Sensitive Content queries using Snowflake + Material
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE FUNCTION EXTRACT_RULE_ID(versionedRuleId STRING) | |
RETURNS STRING | |
LANGUAGE JAVASCRIPT | |
AS 'return versionedRuleId.substring(0, versionedRuleId.indexOf("@") - 1);'; | |
CREATE OR REPLACE FUNCTION GET_DISTINCT_RULES_MATCHED(arr1 ARRAY, arr2 ARRAY) | |
RETURNS ARRAY | |
LANGUAGE JAVASCRIPT | |
AS ' | |
var result = []; | |
var seen = new Set(); | |
for (var i = 0; i < ARR1.length; i++) { | |
var obj = ARR1[i]; | |
if (obj.hasOwnProperty("rulesMatched")) { | |
var rulesMatched = obj.rulesMatched; | |
for (var j = 0; j < rulesMatched.length; j++) { | |
var ruleId = rulesMatched[j]; | |
if (!seen.has(ruleId)) { | |
result.push(ruleId.substring(0, ruleId.indexOf("@"))); | |
seen.add(ruleId); | |
} | |
} | |
} | |
} | |
for (var k = 0; k < ARR2.length; k++) { | |
var obj = ARR2[k]; | |
if (obj.hasOwnProperty("rulesMatched")) { | |
var rulesMatched = obj.rulesMatched; | |
for (var l = 0; l < rulesMatched.length; l++) { | |
var ruleId = rulesMatched[l]; | |
if (!seen.has(ruleId)) { | |
result.push(ruleId.substring(0, ruleId.indexOf("@"))); | |
seen.add(ruleId); | |
} | |
} | |
} | |
} | |
return result; | |
'; | |
WITH allMessages AS ( | |
SELECT | |
forMessage:json:acctId, | |
forMessage:json:date, | |
clusteringdate, | |
forMessage:json:acctEmail AS ACCTEMAIL, | |
forMessage:json:messageId AS MESSAGEID, | |
lockanalysis:analysis:lockReasons, | |
GET_DISTINCT_RULES_MATCHED(coalesce(lockanalysis:analysis:lockReasons, array_construct()), lockanalysis:analysis:lockReasons) as rulesMatched, | |
array_cat(coalesce(lockanalysis:analysis:lockReasons, array_construct()), coalesce(lockanalysis:analysis:ignoredLockReasons, array_construct())) as test | |
FROM ( | |
SELECT | |
*, | |
-- we want to look @ the latest 'lockAnalysis' event from material, which will contain the latest state on the message for a given account | |
ROW_NUMBER() OVER (PARTITION BY forMessage:json:host, forMessage:json:acctId, forMessage:json:messageId ORDER BY timestamp DESC) AS rn | |
FROM MATERIAL_STAGING.PUBLIC.TriEvent_lockAnalysis | |
) | |
WHERE rn = 1 | |
AND forMessage:json:date = :daterange | |
AND array_size(rulesmatched) > 0 | |
AND formessage:json:acctEmail not ilike 'stagingdiagnostics@gbogh.life' | |
AND formessage:json:acctEmail not ilike 'stagingdiagnostics@gbogh.site' | |
ORDER BY forMessage:json:date DESC | |
), | |
UsersAndBuckets As ( | |
SELECT | |
ACCTEMAIL, | |
MESSAGEID, | |
rulesMatched, | |
:datebucket(clusteringdate) as bucket, | |
iff(ARRAY_CONTAINS('confidentialTag'::variant, rulesmatched), 1, 0) as confidential, | |
iff(ARRAY_CONTAINS('creditCard'::variant, rulesmatched), 1, 0) as creditCard, | |
iff(ARRAY_CONTAINS('invoices'::variant, rulesmatched), 1, 0) as invoices, | |
iff(ARRAY_CONTAINS('financialReport'::variant, rulesmatched), 1, 0) as financialReport, | |
iff(ARRAY_CONTAINS('payroll'::variant, rulesmatched), 1, 0) as payroll | |
FROM allMessages | |
), | |
PeriodOverPeriodStats AS | |
( | |
SELECT | |
ACCTEMAIL, | |
-- Can change | |
bucket, | |
SUM(confidential) + SUM(invoices) + SUM(creditCard) + SUM(financialReport) + SUM(payroll) as total, | |
-- iff(DATEDIFF(month, bucket, CURRENT_DATE()) = 1, SUM(confidential) + SUM(invoices) + SUM(creditCard) + SUM(financialReport) + SUM(payroll), 0) as totalLastMonth, | |
-- iff(DATEDIFF(month, bucket, CURRENT_DATE()) = 2, SUM(confidential) + SUM(invoices) + SUM(creditCard) + SUM(financialReport) + SUM(payroll), 0) as totalTwoMonthsAgo, | |
SUM(confidential) as confidential, | |
SUM(invoices) as invoices, | |
SUM(creditCard) as creditCard, | |
SUM(financialReport) as financialReport, | |
SUM(payroll) as payroll | |
FROM USERSANDBUCKETS | |
GROUP BY ACCTEMAIL, BUCKET | |
ORDER by total desc | |
) | |
SELECT * | |
FROM ( | |
SELECT | |
*, | |
-- we want to look @ the latest 'lockAnalysis' event from material, which will contain the latest state on the message for a given account | |
ROW_NUMBER() OVER (PARTITION BY bucket ORDER BY total DESC) AS rn | |
FROM PeriodOverPeriodStats | |
) | |
WHERE rn <= 4 | |
-- SELECT | |
-- ACCTEMAIL, | |
-- bucket, | |
-- -- using a different PctWise chg here, signifying the % they've added to that which they've previously had. This should stay @ ~0% | |
-- (totalLastMonth - totalTwoMonthsAgo) / greatest(total, 1) * 100 as pctChange | |
-- FROM PERIODOVERPERIODSTATS | |
-- ORDER BY absoluteChange desc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE FUNCTION EXTRACT_RULE_ID(versionedRuleId STRING) | |
RETURNS STRING | |
LANGUAGE JAVASCRIPT | |
AS 'return versionedRuleId.substring(0, versionedRuleId.indexOf("@") - 1);'; | |
CREATE OR REPLACE FUNCTION GET_DISTINCT_RULES_MATCHED(arr1 ARRAY, arr2 ARRAY) | |
RETURNS ARRAY | |
LANGUAGE JAVASCRIPT | |
AS ' | |
var result = []; | |
var seen = new Set(); | |
for (var i = 0; i < ARR1.length; i++) { | |
var obj = ARR1[i]; | |
if (obj.hasOwnProperty("rulesMatched")) { | |
var rulesMatched = obj.rulesMatched; | |
for (var j = 0; j < rulesMatched.length; j++) { | |
var ruleId = rulesMatched[j]; | |
if (!seen.has(ruleId)) { | |
result.push(ruleId.substring(0, ruleId.indexOf("@"))); | |
seen.add(ruleId); | |
} | |
} | |
} | |
} | |
for (var k = 0; k < ARR2.length; k++) { | |
var obj = ARR2[k]; | |
if (obj.hasOwnProperty("rulesMatched")) { | |
var rulesMatched = obj.rulesMatched; | |
for (var l = 0; l < rulesMatched.length; l++) { | |
var ruleId = rulesMatched[l]; | |
if (!seen.has(ruleId)) { | |
result.push(ruleId.substring(0, ruleId.indexOf("@"))); | |
seen.add(ruleId); | |
} | |
} | |
} | |
} | |
return result; | |
'; | |
WITH allMessages AS ( | |
SELECT | |
forMessage:json:acctId, | |
forMessage:json:date, | |
clusteringdate, | |
forMessage:json:acctEmail AS ACCTEMAIL, | |
forMessage:json:messageId AS MESSAGEID, | |
lockanalysis:analysis:lockReasons, | |
GET_DISTINCT_RULES_MATCHED(coalesce(lockanalysis:analysis:lockReasons, array_construct()), lockanalysis:analysis:lockReasons) as rulesMatched, | |
array_cat(coalesce(lockanalysis:analysis:lockReasons, array_construct()), coalesce(lockanalysis:analysis:ignoredLockReasons, array_construct())) as test | |
FROM ( | |
SELECT | |
*, | |
-- we want to look @ the latest 'lockAnalysis' event from material, which will contain the latest state on the message for a given account | |
ROW_NUMBER() OVER (PARTITION BY forMessage:json:host, forMessage:json:acctId, forMessage:json:messageId ORDER BY timestamp DESC) AS rn | |
FROM MATERIAL_STAGING.PUBLIC.TriEvent_lockAnalysis | |
) | |
WHERE rn = 1 | |
AND forMessage:json:date = :daterange | |
AND array_size(rulesmatched) > 0 | |
ORDER BY forMessage:json:date DESC | |
), | |
categoriesOfInterest AS ( | |
SELECT | |
ACCTEMAIL, | |
MESSAGEID, | |
rulesMatched, | |
iff(ARRAY_CONTAINS('confidentialTag'::variant, rulesmatched), 1, 0) as confidential, | |
iff(ARRAY_CONTAINS('creditCard'::variant, rulesmatched), 1, 0) as creditCard, | |
iff(ARRAY_CONTAINS('invoices'::variant, rulesmatched), 1, 0) as invoices, | |
iff(ARRAY_CONTAINS('financialReport'::variant, rulesmatched), 1, 0) as financialReport, | |
iff(ARRAY_CONTAINS('payroll'::variant, rulesmatched), 1, 0) as payroll | |
FROM allMessages | |
) | |
SELECT | |
COUNT(*) as Count, | |
SUM(confidential) as confidential, | |
SUM(creditCard) as creditCards, | |
SUM(invoices) as invoices, | |
SUM(financialReport) as FinancialReports, | |
SUM(payroll) as payroll | |
FROM categoriesofInterest |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment