Skip to content

Instantly share code, notes, and snippets.

@benkuhn
Created June 10, 2020 12:24
Show Gist options
  • Save benkuhn/2566af1568b67f4b2d4edb8637660f48 to your computer and use it in GitHub Desktop.
Save benkuhn/2566af1568b67f4b2d4edb8637660f48 to your computer and use it in GitHub Desktop.
Athena view to enrich Cloudfront logs
CREATE OR REPLACE VIEW cloudfront_logs_plus AS
SELECT *,
replace(replace(user_agent, '%25', ' '), '%20', ' ') as user_agent_decoded,
(sc_content_type='text/html' and status = 200 and uri not like '/theme/pages/%') as is_pageview,
(referrer like '%benkuhn.net%' or referrer like '%dvmskfca82zxm.cloudfront.net%') as is_internal_referrer,
(
user_agent like 'Zapier'
or user_agent like 'SimplePie%'
or user_agent like 'Feedbin%'
or user_agent like 'Fever/%'
or user_agent like 'Blogtrottr%'
or user_agent like 'Feedly/%'
or user_agent like 'RSSMix/%'
or user_agent like 'Ruby%'
or user_agent like 'Tiny%2520Tiny%2520RSS%'
or user_agent like 'Feed%2520Wrangler%'
or user_agent like 'Digg%2520Feed%2520Fetcher%'
or user_agent like 'kouio.com%'
or user_agent like 'RSS%2520Bot%'
or user_agent like 'UniversalFeedParser%'
or user_agent like 'facebookexternalhit%'
or user_agent like 'BUbiNG%'
or user_agent like 'RSSOwl%'
or user_agent like 'node-superagent%'
or user_agent like 'com.apple.Safari.WebFeedParser%'
or user_agent like 'NewsBlur%2520Feed%2520Fetcher%'
or user_agent like '%QuiteRSS%'
or user_agent like '%Googlebot%'
or user_agent like '%DotBot%'
or user_agent like '%bingbot%'
or user_agent like '%feeder.co%'
or user_agent like '%YandexBot/%'
or user_agent like '%BLEXBot/%'
or user_agent like '%Baiduspider/%'
or user_agent like '%AhrefsBot/%'
or user_agent like '%inoreader.com%'
or user_agent like '%SeznamBot%'
or user_agent like '%Applebot%'
or user_agent like '%SemrushBot%'
or user_agent like '%BazQux/%'
or user_agent like 'Slack-ImgProxy%'
or user_agent like 'Buck/%'
or user_agent like '%resyndicator%'
or user_agent like 'Go%package%http'
or user_agent like '%spbot/%'
or user_agent like '%Qwantify/%'
or user_agent like '%MJ12bot/%'
or user_agent like 'Sogou%'
or user_agent like 'FeedlyBot/%'
or user_agent like '%Elfeed%'
or user_agent like 'FeedHQ/%'
or user_agent like 'MauiBot%'
or user_agent like 'Twitterbot/%'
or user_agent like 'Dreamwidth%'
or user_agent like '%linkdexbot%'
or user_agent in ('micro-rss-parser', 'Ruby', 'curl', '-', 'Rome', 'The%2520Knowledge%2520AI')
) as is_robot
from cloudfront_logs;
-- Note: this double counts newsblur
CREATE OR REPLACE VIEW rss_subscribers AS
WITH subscriber_numbers AS (
SELECT cast(regexp_extract(user_agent_decoded,
'(\d+) (subscriber|reader)s?', 1) AS integer) AS subscribers,
regexp_replace(regexp_replace(user_agent_decoded, '\(Mozilla/.*$', ''), '(\d+) (subscriber|reader)s?', '') AS user_agent
FROM cloudfront_logs_plus
WHERE user_agent_decoded LIKE '%subscriber%' OR user_agent_decoded LIKE '%reader%'
)
SELECT max(subscribers) AS subscribers,
user_agent
FROM subscriber_numbers
GROUP BY 2
ORDER BY 1 desc;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment