Skip to content

Instantly share code, notes, and snippets.

@jwhitlock
Last active January 10, 2017 19:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwhitlock/f636a8d9424f3f344f0f1b6cdbb95ae7 to your computer and use it in GitHub Desktop.
Save jwhitlock/f636a8d9424f3f344f0f1b6cdbb95ae7 to your computer and use it in GitHub Desktop.
Remove unused tags, and merge "duplicate" tags
# Designed to be pasted in a Django shell session
# Bug 1293749 - Add missing UNIQUE KEY indexes in production
# Some tags are duplicates according to collation, such as "Tag" and "tag"
# Prefer the tag with the most usage, falling back to smallest ID
# Drop unused tags, to simplify tag administration
# One entry per tag model to clean up. process_tags unpacks each entry as
# (model, relation_name, reverse_relation_name):
#   model                 - the tag model class
#   relation_name         - attribute on a tag instance giving its related items
#   reverse_relation_name - attribute on a tagged object giving its tags
# NOTE(review): the model classes are not imported in this file; presumably
# they are already in scope in the shell session - confirm before pasting.
tag_relations = (
(Tag, 'taggit_taggeditem_items', 'tags'),
(DocumentTag, 'wiki_taggeddocument_items', 'tags'),
(LocalizationTag, 'wiki_localizationtaggedrevision_items', 'localization_tags'),
(ReviewTag, 'wiki_reviewtaggedrevision_items', 'review_tags'),
)
def scan_tags(model, relation_name):
    """Find tags that are unused and in-use tags that lose to a duplicate.

    Nothing is modified here; the results are meant to be acted on by the
    caller.

    Args:
        model: Tag model class. ``model.objects.all()`` is iterated and
            ``model.objects.filter(name=...)`` is used to find duplicates.
        relation_name: Name of the attribute on each tag whose ``count()``
            gives the number of items related to that tag.

    Returns:
        A tuple ``(empty_tag_ids, replace_tag_ids)``:

        * ``empty_tag_ids`` - list of IDs of tags with no related items.
        * ``replace_tag_ids`` - dict mapping the ID of an in-use tag to the
          ID of the duplicate that should replace it.
    """
    counts = {}
    dupe_ids = {}
    empty_tag_ids = []

    # Count items for all tags, note the unused ones, and find duplicates
    for tag in model.objects.all():
        rel_count = getattr(tag, relation_name).count()
        counts[tag.id] = rel_count
        if not rel_count:
            empty_tag_ids.append(tag.id)
        # The name comparison is delegated to filter(), so tags count as
        # duplicates whenever the backend treats their names as equal
        # (such as "Tag" and "tag" under a case-insensitive collation).
        # Fetch the IDs once rather than issuing count() and then values_list().
        same_name_ids = list(
            model.objects.filter(name=tag.name).values_list('id', flat=True))
        if len(same_name_ids) > 1:
            dupe_ids[tag.id] = same_name_ids

    # Pick replacement tags for duplicates
    replace_tag_ids = {}
    for orig_id, duplicate_ids in dupe_ids.items():
        # Unused tags are reported in empty_tag_ids instead of being merged
        if not counts[orig_id]:
            continue
        # Winner has most relations, or lowest ID on a tie
        winner = max(duplicate_ids, key=lambda tag_id: (counts[tag_id], -tag_id))
        if winner != orig_id:
            replace_tag_ids[orig_id] = winner
    return empty_tag_ids, replace_tag_ids
def drop_empty_tags(model, empty_tag_ids, dry_run):
    """Report each tag that has no related items and, unless dry_run, delete it.

    Args:
        model: Tag model class. Each tag is loaded with
            ``model.objects.get(id=...)`` and ``model._meta.object_name``
            labels the printed messages.
        empty_tag_ids: Iterable of IDs of the tags to drop.
        dry_run: When true, only print what would be dropped.
    """
    model_name = model._meta.object_name
    for tag_id in empty_tag_ids:
        tag = model.objects.get(id=tag_id)
        # Format the name as text: pre-encoding it to bytes makes Python 3
        # print a b'...' literal instead of the tag name.
        print('Dropping %s "%s" with no related items' % (model_name, tag.name))
        if not dry_run:
            tag.delete()
def move_tags(model, relation_name, reverse_relation_name, replace_tag_ids, dry_run):
    """Re-tag items from each losing duplicate tag to its replacement.

    For every ``orig_id -> new_id`` pair, each object related to the original
    tag has that tag removed and the replacement added, and then the original
    tag is deleted. Every step is printed; on a dry run nothing is changed.

    Args:
        model: Tag model class; tags are loaded with
            ``model.objects.get(id=...)``.
        relation_name: Name of the attribute on a tag whose ``all()`` yields
            its relation rows; each row's ``content_object`` is the tagged
            object.
        reverse_relation_name: Name of the attribute on a tagged object that
            provides ``remove(tag)`` and ``add(tag)``.
        replace_tag_ids: Dict mapping the ID of a tag to replace to the ID of
            the tag that replaces it.
        dry_run: When true, only print the planned changes.
    """
    for orig_id, new_id in replace_tag_ids.items():
        orig_tag = model.objects.get(id=orig_id)
        new_tag = model.objects.get(id=new_id)
        # Keep the names as text: pre-encoding them to bytes makes Python 3
        # print b'...' literals instead of the tag names.
        orig_name = orig_tag.name
        new_name = new_tag.name
        print('Replace "%s" with "%s"' % (orig_name, new_name))
        # Snapshot the relations before the loop starts removing tags
        orig_relations = list(getattr(orig_tag, relation_name).all())
        for orig_rel in orig_relations:
            obj = orig_rel.content_object
            tags = getattr(obj, reverse_relation_name)
            print(' Removing "%s" from tags for %s' % (orig_name, obj))
            if not dry_run:
                tags.remove(orig_tag)
            print(' Adding "%s" to tags for %s' % (new_name, obj))
            if not dry_run:
                tags.add(new_tag)
        print(' Deleting "%s"' % (orig_name,))
        if not dry_run:
            orig_tag.delete()
def process_tags(tag_relations, dry_run=True):
    """Scan, prune and merge the tags of every model listed in tag_relations.

    Args:
        tag_relations: Iterable of
            ``(model, relation_name, reverse_relation_name)`` triples.
        dry_run: Passed through to drop_empty_tags and move_tags; defaults
            to True.
    """
    for entry in tag_relations:
        tag_model, tag_attr, object_attr = entry
        scan_result = scan_tags(tag_model, tag_attr)
        unused_ids = scan_result[0]
        replacements = scan_result[1]
        # Unused tags are dropped before duplicates are merged
        drop_empty_tags(tag_model, unused_ids, dry_run)
        move_tags(tag_model, tag_attr, object_attr, replacements, dry_run)
# Runs with the default dry_run=True, so the planned changes are only printed.
# Call process_tags(tag_relations, dry_run=False) to apply them.
process_tags(tag_relations)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment