Skip to content

Instantly share code, notes, and snippets.

@maxpollard
Created August 4, 2023 22:05
Show Gist options
  • Save maxpollard/fafdbd6460b125d7582b5f595ef97d59 to your computer and use it in GitHub Desktop.
Save maxpollard/fafdbd6460b125d7582b5f595ef97d59 to your computer and use it in GitHub Desktop.
Sensitive Content queries using Snowflake + Material
CREATE OR REPLACE FUNCTION EXTRACT_RULE_ID(versionedRuleId STRING)
RETURNS STRING
LANGUAGE JAVASCRIPT
AS 'return versionedRuleId.substring(0, versionedRuleId.indexOf("@") - 1);';
CREATE OR REPLACE FUNCTION GET_DISTINCT_RULES_MATCHED(arr1 ARRAY, arr2 ARRAY)
RETURNS ARRAY
LANGUAGE JAVASCRIPT
AS '
var result = [];
var seen = new Set();
for (var i = 0; i < ARR1.length; i++) {
var obj = ARR1[i];
if (obj.hasOwnProperty("rulesMatched")) {
var rulesMatched = obj.rulesMatched;
for (var j = 0; j < rulesMatched.length; j++) {
var ruleId = rulesMatched[j];
if (!seen.has(ruleId)) {
result.push(ruleId.substring(0, ruleId.indexOf("@")));
seen.add(ruleId);
}
}
}
}
for (var k = 0; k < ARR2.length; k++) {
var obj = ARR2[k];
if (obj.hasOwnProperty("rulesMatched")) {
var rulesMatched = obj.rulesMatched;
for (var l = 0; l < rulesMatched.length; l++) {
var ruleId = rulesMatched[l];
if (!seen.has(ruleId)) {
result.push(ruleId.substring(0, ruleId.indexOf("@")));
seen.add(ruleId);
}
}
}
}
return result;
';
WITH allMessages AS (
SELECT
forMessage:json:acctId,
forMessage:json:date,
clusteringdate,
forMessage:json:acctEmail AS ACCTEMAIL,
forMessage:json:messageId AS MESSAGEID,
lockanalysis:analysis:lockReasons,
GET_DISTINCT_RULES_MATCHED(coalesce(lockanalysis:analysis:lockReasons, array_construct()), lockanalysis:analysis:lockReasons) as rulesMatched,
array_cat(coalesce(lockanalysis:analysis:lockReasons, array_construct()), coalesce(lockanalysis:analysis:ignoredLockReasons, array_construct())) as test
FROM (
SELECT
*,
-- we want to look @ the latest 'lockAnalysis' event from material, which will contain the latest state on the message for a given account
ROW_NUMBER() OVER (PARTITION BY forMessage:json:host, forMessage:json:acctId, forMessage:json:messageId ORDER BY timestamp DESC) AS rn
FROM MATERIAL_STAGING.PUBLIC.TriEvent_lockAnalysis
)
WHERE rn = 1
AND forMessage:json:date = :daterange
AND array_size(rulesmatched) > 0
AND formessage:json:acctEmail not ilike 'stagingdiagnostics@gbogh.life'
AND formessage:json:acctEmail not ilike 'stagingdiagnostics@gbogh.site'
ORDER BY forMessage:json:date DESC
),
UsersAndBuckets As (
SELECT
ACCTEMAIL,
MESSAGEID,
rulesMatched,
:datebucket(clusteringdate) as bucket,
iff(ARRAY_CONTAINS('confidentialTag'::variant, rulesmatched), 1, 0) as confidential,
iff(ARRAY_CONTAINS('creditCard'::variant, rulesmatched), 1, 0) as creditCard,
iff(ARRAY_CONTAINS('invoices'::variant, rulesmatched), 1, 0) as invoices,
iff(ARRAY_CONTAINS('financialReport'::variant, rulesmatched), 1, 0) as financialReport,
iff(ARRAY_CONTAINS('payroll'::variant, rulesmatched), 1, 0) as payroll
FROM allMessages
),
PeriodOverPeriodStats AS
(
SELECT
ACCTEMAIL,
-- Can change
bucket,
SUM(confidential) + SUM(invoices) + SUM(creditCard) + SUM(financialReport) + SUM(payroll) as total,
-- iff(DATEDIFF(month, bucket, CURRENT_DATE()) = 1, SUM(confidential) + SUM(invoices) + SUM(creditCard) + SUM(financialReport) + SUM(payroll), 0) as totalLastMonth,
-- iff(DATEDIFF(month, bucket, CURRENT_DATE()) = 2, SUM(confidential) + SUM(invoices) + SUM(creditCard) + SUM(financialReport) + SUM(payroll), 0) as totalTwoMonthsAgo,
SUM(confidential) as confidential,
SUM(invoices) as invoices,
SUM(creditCard) as creditCard,
SUM(financialReport) as financialReport,
SUM(payroll) as payroll
FROM USERSANDBUCKETS
GROUP BY ACCTEMAIL, BUCKET
ORDER by total desc
)
SELECT *
FROM (
SELECT
*,
-- we want to look @ the latest 'lockAnalysis' event from material, which will contain the latest state on the message for a given account
ROW_NUMBER() OVER (PARTITION BY bucket ORDER BY total DESC) AS rn
FROM PeriodOverPeriodStats
)
WHERE rn <= 4
-- SELECT
-- ACCTEMAIL,
-- bucket,
-- -- using a different PctWise chg here, signifying the % they've added to that which they've previously had. This should stay @ ~0%
-- (totalLastMonth - totalTwoMonthsAgo) / greatest(total, 1) * 100 as pctChange
-- FROM PERIODOVERPERIODSTATS
-- ORDER BY absoluteChange desc
CREATE OR REPLACE FUNCTION EXTRACT_RULE_ID(versionedRuleId STRING)
RETURNS STRING
LANGUAGE JAVASCRIPT
AS 'return versionedRuleId.substring(0, versionedRuleId.indexOf("@") - 1);';
CREATE OR REPLACE FUNCTION GET_DISTINCT_RULES_MATCHED(arr1 ARRAY, arr2 ARRAY)
RETURNS ARRAY
LANGUAGE JAVASCRIPT
AS '
var result = [];
var seen = new Set();
for (var i = 0; i < ARR1.length; i++) {
var obj = ARR1[i];
if (obj.hasOwnProperty("rulesMatched")) {
var rulesMatched = obj.rulesMatched;
for (var j = 0; j < rulesMatched.length; j++) {
var ruleId = rulesMatched[j];
if (!seen.has(ruleId)) {
result.push(ruleId.substring(0, ruleId.indexOf("@")));
seen.add(ruleId);
}
}
}
}
for (var k = 0; k < ARR2.length; k++) {
var obj = ARR2[k];
if (obj.hasOwnProperty("rulesMatched")) {
var rulesMatched = obj.rulesMatched;
for (var l = 0; l < rulesMatched.length; l++) {
var ruleId = rulesMatched[l];
if (!seen.has(ruleId)) {
result.push(ruleId.substring(0, ruleId.indexOf("@")));
seen.add(ruleId);
}
}
}
}
return result;
';
WITH allMessages AS (
SELECT
forMessage:json:acctId,
forMessage:json:date,
clusteringdate,
forMessage:json:acctEmail AS ACCTEMAIL,
forMessage:json:messageId AS MESSAGEID,
lockanalysis:analysis:lockReasons,
GET_DISTINCT_RULES_MATCHED(coalesce(lockanalysis:analysis:lockReasons, array_construct()), lockanalysis:analysis:lockReasons) as rulesMatched,
array_cat(coalesce(lockanalysis:analysis:lockReasons, array_construct()), coalesce(lockanalysis:analysis:ignoredLockReasons, array_construct())) as test
FROM (
SELECT
*,
-- we want to look @ the latest 'lockAnalysis' event from material, which will contain the latest state on the message for a given account
ROW_NUMBER() OVER (PARTITION BY forMessage:json:host, forMessage:json:acctId, forMessage:json:messageId ORDER BY timestamp DESC) AS rn
FROM MATERIAL_STAGING.PUBLIC.TriEvent_lockAnalysis
)
WHERE rn = 1
AND forMessage:json:date = :daterange
AND array_size(rulesmatched) > 0
ORDER BY forMessage:json:date DESC
),
categoriesOfInterest AS (
SELECT
ACCTEMAIL,
MESSAGEID,
rulesMatched,
iff(ARRAY_CONTAINS('confidentialTag'::variant, rulesmatched), 1, 0) as confidential,
iff(ARRAY_CONTAINS('creditCard'::variant, rulesmatched), 1, 0) as creditCard,
iff(ARRAY_CONTAINS('invoices'::variant, rulesmatched), 1, 0) as invoices,
iff(ARRAY_CONTAINS('financialReport'::variant, rulesmatched), 1, 0) as financialReport,
iff(ARRAY_CONTAINS('payroll'::variant, rulesmatched), 1, 0) as payroll
FROM allMessages
)
SELECT
COUNT(*) as Count,
SUM(confidential) as confidential,
SUM(creditCard) as creditCards,
SUM(invoices) as invoices,
SUM(financialReport) as FinancialReports,
SUM(payroll) as payroll
FROM categoriesofInterest
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment