Skip to content

Instantly share code, notes, and snippets.

@FUSAKLA
Last active November 7, 2022 11:05
Show Gist options
  • Save FUSAKLA/3f322d478839cabef7ace64a4be9177a to your computer and use it in GitHub Desktop.
Save FUSAKLA/3f322d478839cabef7ace64a4be9177a to your computer and use it in GitHub Desktop.
PostHog data retention cleanup job
#!/usr/bin/python
import logging
import os
from datetime import datetime, timedelta
import django
django.setup()
from posthog.models import Event, ElementGroup
from django.utils import timezone
# --- Runtime configuration, read once from the environment -----------------

# Events whose timestamp is older than this many days are deleted.
max_age_days = int(os.environ.get("POSTHOG_CLEANUP_OLDER_THAN_DAYS", 30))

# Number of rows removed per DELETE statement.
step_size = int(os.environ.get("POSTHOG_CLEANUP_BATCH_SIZE", 1000))

# When enabled ("true", "yes" or "1", case-insensitive) nothing is deleted;
# the script only logs what it would have done.
dry_run = os.environ.get("POSTHOG_CLEANUP_DRY_RUN", "False").lower() in {"true", "yes", "1"}

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S ",
)
def get_events_older_than(older_than):
    """Return the primary keys of all events older than ``older_than`` days.

    Args:
        older_than: Age threshold in days; events whose ``timestamp`` lies
            strictly before "now minus this many days" are selected.

    Returns:
        A flat ``values_list`` queryset of ``Event`` primary keys.
    """
    # Use Django's own clock instead of ``make_aware(datetime.now())``:
    # converting a naive local time can hit ambiguous or non-existent wall
    # clock times around DST changes and silently assumes the process clock
    # is in Django's configured time zone.
    cutoff = timezone.now() - timedelta(days=older_than)
    return Event.objects.filter(timestamp__lt=cutoff).values_list('pk', flat=True)
def get_non_referenced_event_groups():
    """Return the element groups whose hash is not used by any event.

    Returns:
        A queryset of ``ElementGroup`` rows whose ``hash`` does not appear
        as the ``elements_hash`` of any remaining ``Event``.
    """
    referenced_hashes = (
        Event.objects
        # A NULL inside a SQL ``NOT IN (...)`` list makes the predicate
        # unknown for every non-matching row, so nothing would be returned.
        # Depending on the Django version a None in the list is either
        # dropped or sent as NULL, so filter it out explicitly.
        # NOTE(review): presumably elements_hash is nullable - confirm on the model.
        .exclude(elements_hash__isnull=True)
        # Clear any default ordering so DISTINCT applies to the hash alone.
        .order_by()
        .values_list('elements_hash', flat=True)
        # Many events share a hash; let the database de-duplicate instead of
        # loading one value per event into memory and into the IN clause.
        .distinct()
    )
    return ElementGroup.objects.exclude(hash__in=list(referenced_hashes))
def delete_items(item_type, items):
    """Delete one batch of rows of ``item_type``.

    Args:
        item_type: Model class whose rows are deleted (accessed through
            ``item_type.objects``).
        items: Iterable of primary keys or of model instances; both are
            accepted because the callers in this script pass either kind.

    Does nothing except log a message when the module-level ``dry_run`` flag
    is set.
    """
    if dry_run:
        logging.info("Skipping delete of items in dry run mode...")
        return

    # Normalize to primary keys: a plain key has no ``pk`` attribute and is
    # used as is, a model instance contributes its ``pk``.
    primary_keys = [getattr(item, "pk", item) for item in items]
    if not primary_keys:
        # Nothing to delete in this batch.
        return

    item_type.objects.filter(pk__in=primary_keys).delete()
def delete_items_batched(item_type, items, logging_indent=6 * " "):
    """Delete ``items`` of ``item_type`` in batches of ``step_size``.

    Args:
        item_type: Model class the items belong to; forwarded to
            ``delete_items``.
        items: Sliceable, sized collection of the items to delete (for
            example a queryset; note that ``len()`` evaluates a lazy queryset).
        logging_indent: Prefix put in front of every progress log line.

    Raises:
        ValueError: If the module-level ``step_size`` is not positive; the
            batching loop could never make progress with such a value.
    """
    if step_size < 1:
        raise ValueError("POSTHOG_CLEANUP_BATCH_SIZE must be a positive integer, got %d" % step_size)

    number_of_items = len(items)
    logging.info("%sDeleting %d items of type %s using batches of %d size:", logging_indent, number_of_items,
                 item_type.__name__, step_size)

    for batch_start in range(0, number_of_items, step_size):
        batch_end = min(batch_start + step_size, number_of_items)
        delete_items(item_type, items[batch_start:batch_end])
        # Report the share already processed; the final batch is covered by
        # the unconditional 100% line below.
        if batch_end < number_of_items:
            logging.info("%s %d%%", logging_indent, batch_end * 100 // number_of_items)

    logging.info("%s 100%%", logging_indent)
if __name__ == "__main__":
    # Script entry point: first purge old events, then purge the element
    # groups that no remaining event refers to, and report the elapsed time.
    logging.info("Running cleanup of PostHog...")
    started_at = datetime.now()

    logging.info(" - Deleting all events older than %d days:", max_age_days)
    expired_events = get_events_older_than(max_age_days)
    delete_items_batched(Event, expired_events)

    logging.info(" - Deleting all elements and element groups not referenced by any event anymore:")
    orphaned_groups = get_non_referenced_event_groups()
    delete_items_batched(ElementGroup, orphaned_groups)

    elapsed = datetime.now() - started_at
    logging.info("Cleanup finished, total duration: %s", elapsed)
@tj-lmg
Copy link

tj-lmg commented May 10, 2021

from posthog.models import Event, ElementGroup is throwing an error, as there is no models module in the posthog library - can you please confirm which version of the posthog library you are using?

@FUSAKLA
Copy link
Author

FUSAKLA commented Jul 7, 2021

Hi, the snippet is actually quite old, so there might definitely be some compatibility issues.
I wrote it on Sep 3, 2020 for the release that was current at that time; I'm not sure exactly which version that was.

But I still have it running with the latest 1.26.0 version and it seems to be working just fine 🤔

It might be an issue with imports; try running it as python posthog_cleanup.py from the app's directory, where manage.py is located.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment