Athena view to enrich Cloudfront logs
CREATE OR REPLACE VIEW cloudfront_logs_plus AS | |
SELECT *, | |
replace(replace(user_agent, '%25', ' '), '%20', ' ') as user_agent_decoded, | |
(sc_content_type='text/html' and status = 200 and uri not like '/theme/pages/%') as is_pageview, | |
(referrer like '%benkuhn.net%' or referrer like '%dvmskfca82zxm.cloudfront.net%') as is_internal_referrer, | |
( | |
user_agent like 'Zapier' | |
or user_agent like 'SimplePie%' | |
or user_agent like 'Feedbin%' | |
or user_agent like 'Fever/%' | |
or user_agent like 'Blogtrottr%' | |
or user_agent like 'Feedly/%' | |
or user_agent like 'RSSMix/%' | |
or user_agent like 'Ruby%' | |
or user_agent like 'Tiny%2520Tiny%2520RSS%' | |
or user_agent like 'Feed%2520Wrangler%' | |
or user_agent like 'Digg%2520Feed%2520Fetcher%' | |
or user_agent like 'kouio.com%' | |
or user_agent like 'RSS%2520Bot%' | |
or user_agent like 'UniversalFeedParser%' | |
or user_agent like 'facebookexternalhit%' | |
or user_agent like 'BUbiNG%' | |
or user_agent like 'RSSOwl%' | |
or user_agent like 'node-superagent%' | |
or user_agent like 'com.apple.Safari.WebFeedParser%' | |
or user_agent like 'NewsBlur%2520Feed%2520Fetcher%' | |
or user_agent like '%QuiteRSS%' | |
or user_agent like '%Googlebot%' | |
or user_agent like '%DotBot%' | |
or user_agent like '%bingbot%' | |
or user_agent like '%feeder.co%' | |
or user_agent like '%YandexBot/%' | |
or user_agent like '%BLEXBot/%' | |
or user_agent like '%Baiduspider/%' | |
or user_agent like '%AhrefsBot/%' | |
or user_agent like '%inoreader.com%' | |
or user_agent like '%SeznamBot%' | |
or user_agent like '%Applebot%' | |
or user_agent like '%SemrushBot%' | |
or user_agent like '%BazQux/%' | |
or user_agent like 'Slack-ImgProxy%' | |
or user_agent like 'Buck/%' | |
or user_agent like '%resyndicator%' | |
or user_agent like 'Go%package%http' | |
or user_agent like '%spbot/%' | |
or user_agent like '%Qwantify/%' | |
or user_agent like '%MJ12bot/%' | |
or user_agent like 'Sogou%' | |
or user_agent like 'FeedlyBot/%' | |
or user_agent like '%Elfeed%' | |
or user_agent like 'FeedHQ/%' | |
or user_agent like 'MauiBot%' | |
or user_agent like 'Twitterbot/%' | |
or user_agent like 'Dreamwidth%' | |
or user_agent like '%linkdexbot%' | |
or user_agent in ('micro-rss-parser', 'Ruby', 'curl', '-', 'Rome', 'The%2520Knowledge%2520AI') | |
) as is_robot | |
from cloudfront_logs; | |
-- Note: this double counts newsblur | |
CREATE OR REPLACE VIEW rss_subscribers AS | |
WITH subscriber_numbers AS ( | |
SELECT cast(regexp_extract(user_agent_decoded, | |
'(\d+) (subscriber|reader)s?', 1) AS integer) AS subscribers, | |
regexp_replace(regexp_replace(user_agent_decoded, '\(Mozilla/.*$', ''), '(\d+) (subscriber|reader)s?', '') AS user_agent | |
FROM cloudfront_logs_plus | |
WHERE user_agent_decoded LIKE '%subscriber%' OR user_agent_decoded LIKE '%reader%' | |
) | |
SELECT max(subscribers) AS subscribers, | |
user_agent | |
FROM subscriber_numbers | |
GROUP BY 2 | |
ORDER BY 1 desc; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment