Created
June 10, 2020 12:24
-
-
Save benkuhn/2566af1568b67f4b2d4edb8637660f48 to your computer and use it in GitHub Desktop.
Athena view to enrich Cloudfront logs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE VIEW cloudfront_logs_plus AS | |
SELECT *, | |
replace(replace(user_agent, '%25', ' '), '%20', ' ') as user_agent_decoded, | |
(sc_content_type='text/html' and status = 200 and uri not like '/theme/pages/%') as is_pageview, | |
(referrer like '%benkuhn.net%' or referrer like '%dvmskfca82zxm.cloudfront.net%') as is_internal_referrer, | |
( | |
user_agent like 'Zapier' | |
or user_agent like 'SimplePie%' | |
or user_agent like 'Feedbin%' | |
or user_agent like 'Fever/%' | |
or user_agent like 'Blogtrottr%' | |
or user_agent like 'Feedly/%' | |
or user_agent like 'RSSMix/%' | |
or user_agent like 'Ruby%' | |
or user_agent like 'Tiny%2520Tiny%2520RSS%' | |
or user_agent like 'Feed%2520Wrangler%' | |
or user_agent like 'Digg%2520Feed%2520Fetcher%' | |
or user_agent like 'kouio.com%' | |
or user_agent like 'RSS%2520Bot%' | |
or user_agent like 'UniversalFeedParser%' | |
or user_agent like 'facebookexternalhit%' | |
or user_agent like 'BUbiNG%' | |
or user_agent like 'RSSOwl%' | |
or user_agent like 'node-superagent%' | |
or user_agent like 'com.apple.Safari.WebFeedParser%' | |
or user_agent like 'NewsBlur%2520Feed%2520Fetcher%' | |
or user_agent like '%QuiteRSS%' | |
or user_agent like '%Googlebot%' | |
or user_agent like '%DotBot%' | |
or user_agent like '%bingbot%' | |
or user_agent like '%feeder.co%' | |
or user_agent like '%YandexBot/%' | |
or user_agent like '%BLEXBot/%' | |
or user_agent like '%Baiduspider/%' | |
or user_agent like '%AhrefsBot/%' | |
or user_agent like '%inoreader.com%' | |
or user_agent like '%SeznamBot%' | |
or user_agent like '%Applebot%' | |
or user_agent like '%SemrushBot%' | |
or user_agent like '%BazQux/%' | |
or user_agent like 'Slack-ImgProxy%' | |
or user_agent like 'Buck/%' | |
or user_agent like '%resyndicator%' | |
or user_agent like 'Go%package%http' | |
or user_agent like '%spbot/%' | |
or user_agent like '%Qwantify/%' | |
or user_agent like '%MJ12bot/%' | |
or user_agent like 'Sogou%' | |
or user_agent like 'FeedlyBot/%' | |
or user_agent like '%Elfeed%' | |
or user_agent like 'FeedHQ/%' | |
or user_agent like 'MauiBot%' | |
or user_agent like 'Twitterbot/%' | |
or user_agent like 'Dreamwidth%' | |
or user_agent like '%linkdexbot%' | |
or user_agent in ('micro-rss-parser', 'Ruby', 'curl', '-', 'Rome', 'The%2520Knowledge%2520AI') | |
) as is_robot | |
from cloudfront_logs; | |
-- Note: this double counts newsblur | |
CREATE OR REPLACE VIEW rss_subscribers AS | |
WITH subscriber_numbers AS ( | |
SELECT cast(regexp_extract(user_agent_decoded, | |
'(\d+) (subscriber|reader)s?', 1) AS integer) AS subscribers, | |
regexp_replace(regexp_replace(user_agent_decoded, '\(Mozilla/.*$', ''), '(\d+) (subscriber|reader)s?', '') AS user_agent | |
FROM cloudfront_logs_plus | |
WHERE user_agent_decoded LIKE '%subscriber%' OR user_agent_decoded LIKE '%reader%' | |
) | |
SELECT max(subscribers) AS subscribers, | |
user_agent | |
FROM subscriber_numbers | |
GROUP BY 2 | |
ORDER BY 1 desc; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment