Created
July 11, 2024 12:53
-
-
Save thinkwelltwd/0ccb7e3cc748984bb1912d7a624a91f0 to your computer and use it in GitHub Desktop.
Extend Redwood Classification of YouTube with Starlark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is an example Starlark script to extend Redwood's classification
of YouTube videos, and also track time quotas.
It will not work as is - for inspiration purposes only!
""" | |
# Regex locating a YouTube video ID inside a URL. Every alternative prefix
# ends in '=' or '/', immediately followed by the 11-character ID.
# The dot of "youtu.be" is written as [.] so that it matches a literal dot
# only (a bare "." would accept any character, e.g. "youtuXbe/").
VIDEO_ID_PATTERN = "(?:v=|embed/|youtu[.]be/)([a-zA-Z0-9_-]{11})"

# Regex of request paths that filter_request() skips.
SKIP_PATHS = "(error|generate)_204|ptracking|pagead/|/(api/stats|log_event)"

# Hosts treated as YouTube. filter_request() compares them against the
# request host with startswith(), so they act as host prefixes.
YT_HOSTS = (
    'youtube.com',
    'm.youtube.com',
    'www.youtube.com',
    'youtube-nocookie.com',
    'www.youtube-nocookie.com',
    'youtu.be',
    'www.youtu.be',
)

# Path suffixes of static assets that filter_request() skips.
ASSET_EXTENSIONS = (
    '.css',
    '.js',
)

# Category returned by classify_video_id() when no category is available.
NOT_CLASSIFIED = 'N/A'

# Categories that are subject to a time quota.
QUOTA_LIMITED_CATEGORIES = (
    'cat_videos',
    'sports',
)
def filter_request(request):
    """
    Check for Media request and send the Media IDs
    to an API for further evaluation. Only
    supporting YouTube URLs at this point.

    Returns early, leaving the request untouched, when the path is empty,
    the request is already being blocked, the host is not one of YT_HOSTS,
    the path is a static asset or matches SKIP_PATHS, or no video ID can be
    found. Otherwise the video is classified and, when banned_time_quota()
    reports a ban for that category, a blocking score is written to
    request.scores.
    """
    if not request.path:
        return

    # If the request is already being blocked, there's
    # no need to check for time quotas
    if request.action == 'block':
        return

    if (
        not request.host.startswith(YT_HOSTS)
        or request.path.endswith(ASSET_EXTENSIONS)
        or re.findall(SKIP_PATHS, request.path)
    ):
        return

    video_id = get_video_id(request)
    if not video_id:
        return

    # classify_video_id() takes the request as well as the ID: it consults
    # request.acls and request.scores before doing any external lookup.
    category = classify_video_id(request, video_id)
    if banned_time_quota(request, category):
        request.scores['some_blocking_category'] = 1500
def banned_time_quota(request, category):
    """
    Tell whether the user is out of time quota for this category.

    Categories outside QUOTA_LIMITED_CATEGORIES are answered with False
    without contacting the quota service. For the others, the user, user
    agent and category are posted to the service, and the result is True
    only when the reply's 'action' field is 'expired'.
    """
    if category in QUOTA_LIMITED_CATEGORIES:
        payload = {
            'user': request.user,
            'user_agent': request.user_agent,
            'category': category,
        }
        quota_reply = http.post('http://cachingservice.com/', json_body=payload)
        return quota_reply.json()['action'] == 'expired'

    return False
def get_video_id(request):
    """
    Get Video ID, checking either the request URL
    or the referer URL.

    Returns the value of the `v` query parameter when it is present;
    otherwise the first VIDEO_ID_PATTERN match in the Referer header,
    reduced to the bare ID. Returns '' when neither source yields an ID.
    """
    query_param = request.query.get('v')
    if query_param:
        return query_param

    referer = request.header.get('Referer')
    if not referer:
        return ''

    matches = re.findall(VIDEO_ID_PATTERN, referer)
    if not matches:
        return ''

    # the regex engine doesn't support groups :-| so a match still carries
    # its 'v=', 'embed/' or 'youtu.be/' prefix. Every prefix ends in '=' or
    # '/', and neither character can occur inside an ID, so keeping only
    # the text after the last separator strips whichever prefix matched
    # (and leaves an already-bare ID unchanged).
    return matches[0].rpartition('/')[2].rpartition('=')[2]
def classify_video_id(request, video_id):
    """
    Resolve the category of a video.

    Lookup order: a quota-limited category already present in the
    request's ACLs or scores, then the classification cache, then the
    external classifier service. Returns NOT_CLASSIFIED when the service
    gives no category or does not answer with status 200.
    """
    # Local Redwood categories or ACLs take precedence over any lookup.
    for limited in QUOTA_LIMITED_CATEGORIES:
        if limited in request.acls or limited in request.scores:
            return limited

    # Previously fetched classifications are reused from the cache.
    cache = Cache('video_classification_cache', 2048)
    cached = cache.get(video_id)
    if cached:
        return cached

    # Fall back to an external service that classifies YT videos.
    url = 'http://externalclassifier.com/%s/' % video_id
    resp = http.get(url)
    if resp.status_code != 200:
        # Failed lookups are reported and not written to the cache.
        print('An error occurred on classifier URL: %s - status: %s' % (url, resp.status_code))
        return NOT_CLASSIFIED

    category = resp.json().get('category') or NOT_CLASSIFIED
    cache.set(video_id, category)
    return category
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment