Skip to content

Instantly share code, notes, and snippets.

@buchi
Last active June 22, 2017 07:37
Show Gist options
  • Save buchi/2c9da7122f32bac138a0c1bce086d63b to your computer and use it in GitHub Desktop.
Save buchi/2c9da7122f32bac138a0c1bce086d63b to your computer and use it in GitHub Desktop.
Check Plone indexes for Unicode values
# Checks the UnIndex-based KeywordIndex and FieldIndex catalog indexes of every Plone site for Unicode strings.
# Indexes should contain only byte strings.
# Having Unicode strings in indexes may result in UnicodeDecodeErrors during indexing or searching.
from Products.CMFPlone.interfaces import IPloneSiteRoot
from Products.CMFCore.utils import getToolByName
def get_plone_sites(root):
    """Recursively collect the Plone sites found below ``root``.

    Every object returned by ``root.values()`` is examined in order:

    * an object whose ``meta_type`` equals ``'Folder'`` is searched
      recursively,
    * an object providing ``IPloneSiteRoot`` is collected,
    * an object whose id is listed in ``root._mount_points`` (when ``root``
      has that attribute) is searched recursively.

    :param root: container exposing ``values()``.
    :returns: list of the site objects found, in traversal order; empty
        when there are none.
    """
    sites = []
    for obj in root.values():
        # Compare by value. ``is`` tests object identity and only matches
        # when both strings happen to be the very same object, so folders
        # could be skipped silently.
        if obj.meta_type == 'Folder':
            sites.extend(get_plone_sites(obj))
        elif IPloneSiteRoot.providedBy(obj):
            sites.append(obj)
        elif obj.getId() in getattr(root, '_mount_points', {}):
            sites.extend(get_plone_sites(obj))
    return sites
sites = get_plone_sites(app)
for site in sites:
print "Examining Plone site '%s' ..." % '/'.join(site.getPhysicalPath())
ct = getToolByName(site, 'portal_catalog')
bad_indexes = set()
for name, idx in ct._catalog.indexes.items():
if idx.meta_type in ['KeywordIndex', 'FieldIndex']:
print "Examining index '%s' ..." % name
bad_values = set()
bad_doc_ids = set()
for datum, doc_ids in idx._index.items():
if isinstance(datum, unicode):
bad_doc_ids |= set(doc_ids)
if datum not in bad_values:
bad_values.add(datum)
bad_indexes.add(name)
print "*** Forward index '%s' contains Unicode value: '%s'. Document Ids: %s" % (name, datum, list(doc_ids))
bad_values = set()
for doc_id, datum in idx._unindex.items():
if isinstance(datum, unicode):
bad_doc_ids.add(doc_id)
if datum not in bad_values:
bad_values.add(datum)
bad_indexes.add(name)
print "*** Inverted index '%s' contains Unicode value: '%s'. Document Ids: %s" % (name, datum, list(doc_ids))
print "***** Bad indexes: %s" % list(bad_indexes)
print "Done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment