Skip to content

Instantly share code, notes, and snippets.

@thinkwelltwd
Created July 11, 2024 12:53
Show Gist options
  • Save thinkwelltwd/0ccb7e3cc748984bb1912d7a624a91f0 to your computer and use it in GitHub Desktop.
Save thinkwelltwd/0ccb7e3cc748984bb1912d7a624a91f0 to your computer and use it in GitHub Desktop.
Extend Redwood Classification of YouTube with Starlark
"""
This is an example Starlark script to extend Redwood's classification
of YouTube videos, and also track time quotas.
It will not work as is - for inspiration purposes only!
"""
# Marker that precedes a YouTube video ID in a URL ('v=', 'embed/' or
# 'youtu.be/'), followed by the 11-character ID itself. The dot in
# 'youtu.be' is escaped so that it only matches a literal '.'.
VIDEO_ID_PATTERN = r"(?:v=|embed/|youtu\.be/)([a-zA-Z0-9_-]{11})"

# Request paths matching this pattern are ignored by filter_request.
SKIP_PATHS = "(error|generate)_204|ptracking|pagead/|/(api/stats|log_event)"

# Host prefixes that filter_request treats as YouTube.
YT_HOSTS = (
    'youtube.com',
    'm.youtube.com',
    'www.youtube.com',
    'youtube-nocookie.com',
    'www.youtube-nocookie.com',
    'youtu.be',
    'www.youtu.be',
)

# Path suffixes of static assets that are never inspected for a video ID.
ASSET_EXTENSIONS = (
    '.css',
    '.js',
)

# Category returned when no classification is available for a video.
NOT_CLASSIFIED = 'N/A'

# Categories whose viewing time is subject to a per-user quota.
QUOTA_LIMITED_CATEGORIES = (
    'cat_videos',
    'sports',
)
def filter_request(request):
    """
    Check for Media request and send the Media IDs
    to an API for further evaluation. Only
    supporting YouTube URLs at this point.

    When the video's category is quota limited and the quota has
    expired, a blocking score is added to ``request.scores``.
    Returns nothing; the request is modified in place.
    """
    if not request.path:
        return

    # If the request is already being blocked, there's
    # no need to check for time quotas
    if request.action == 'block':
        return

    # Ignore non-YouTube hosts, static assets and tracking/stat paths.
    # NOTE(review): this is a prefix match, so a host such as
    # 'youtube.com.example.org' is also accepted - confirm this is intended.
    if (
        not request.host.startswith(YT_HOSTS)
        or request.path.endswith(ASSET_EXTENSIONS)
        or re.findall(SKIP_PATHS, request.path)
    ):
        return

    video_id = get_video_id(request)
    if not video_id:
        return

    # classify_video_id needs the request too: it checks the request's
    # ACLs and scores before falling back to the external classifier.
    category = classify_video_id(request, video_id)

    if banned_time_quota(request, category):
        request.scores['some_blocking_category'] = 1500
def banned_time_quota(request, category):
    """
    Report whether the user's time quota for this category is used up.

    Categories outside QUOTA_LIMITED_CATEGORIES are never quota banned.
    For the others, the user, user agent and category are posted to the
    caching service, and the result is True only when the JSON reply
    carries the action 'expired'.
    """
    if category in QUOTA_LIMITED_CATEGORIES:
        payload = {
            'user': request.user,
            'user_agent': request.user_agent,
            'category': category,
        }
        reply = http.post('http://cachingservice.com/', json_body=payload)
        return reply.json()['action'] == 'expired'

    return False
def get_video_id(request):
    """
    Get Video ID, checking either the request URL
    or the referer URL.

    The ``v`` query parameter of the request wins when present.
    Otherwise the Referer header is searched with VIDEO_ID_PATTERN.
    Returns an empty string when no video ID can be found.
    """
    query_param = request.query.get('v')
    if query_param:
        return query_param

    referer = request.header.get('Referer')
    if not referer:
        return ''

    matches = re.findall(VIDEO_ID_PATTERN, referer)
    if not matches:
        return ''

    # The regex engine doesn't support groups (per the original author's
    # note), so a match may still carry its leading 'v=', 'embed/' or
    # 'youtu.be/' marker. VIDEO_ID_PATTERN always ends with exactly 11 ID
    # characters, so the ID is the tail of the match whichever marker
    # preceded it - and also when the engine does return only the group.
    return matches[0][-11:]
def classify_video_id(request, video_id):
    """
    Work out the category of a video.

    Lookup order:
      1. a quota-limited category already present in the request's
         ACLs or scores,
      2. a previously cached classification for this video ID,
      3. the external classification service.

    Returns NOT_CLASSIFIED when the service reports no category or
    answers with a status other than 200.
    """
    # Local Redwood categories or ACLs take precedence.
    for limited in QUOTA_LIMITED_CATEGORIES:
        if limited in request.acls or limited in request.scores:
            return limited

    cache = Cache('video_classification_cache', 2048)
    cached_category = cache.get(video_id)
    if cached_category:
        return cached_category

    # Nothing known locally: ask the external classifier.
    classifier_url = 'http://externalclassifier.com/%s/' % video_id
    response = http.get(classifier_url)

    if response.status_code != 200:
        print('An error occurred on classifier URL: %s - status: %s' % (classifier_url, response.status_code))
        return NOT_CLASSIFIED

    # Only successful lookups are stored in the cache.
    label = response.json().get('category') or NOT_CLASSIFIED
    cache.set(video_id, label)
    return label
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment