# escattone/re_render_docs.py

## re_render_docs.py
# From https://gist.github.com/jwhitlock/43e34e07bef8c3f1863e91f076778ca6
from time import sleep, time

import redis
from celery.states import READY_STATES
from django.conf import settings

from kuma.wiki.models import Document
from kuma.wiki.tasks import render_document


def null_notify_rerender_chunk(event, doc_id, task):
    """No-op render notifier: accept a render event and discard it.

    rerender_chunk falls back to this when no notifier is supplied.
    """
    return None


# Cache of document ID -> full URL, filled by verbose_notify_rerender_chunk
# so each document is fetched at most once.
doc_urls = {}


def verbose_notify_rerender_chunk(event, doc_id, task):
    """Print one line describing a render event.

    The line holds the event name, the task's current state, the document
    ID and the document's full URL. URLs are memoized in ``doc_urls``.
    """
    if doc_id in doc_urls:
        url = doc_urls[doc_id]
    else:
        # First event for this document: look it up and remember its URL.
        url = Document.objects.get(id=doc_id).get_full_url()
        doc_urls[doc_id] = url
    print("Render %s (%s): %d %s" % (event, task.state, doc_id, url))


def rerender_chunk(doc_ids, stuck_time=120, notifier_func=None):
    """
    Queue a render task for every document ID, then poll until all finish.

    Keyword Arguments:
    doc_ids - A sequence of document IDs to re-render
    stuck_time (120) - How many consecutive polls (one second apart) may
        pass without any task finishing before giving up.
    notifier_func (None) - Called as (event, doc_id, task): 'done' when a
        task reaches a ready state, 'stuck' for each unfinished task when
        giving up.

    Return is a tuple of counts (tasks finished, tasks still unfinished)
    """
    notify = notifier_func or null_notify_rerender_chunk
    total = len(doc_ids)

    # Entries are (doc_id, task, last observed state); finished tasks are
    # dropped from the list. No 'start' event is emitted when queueing.
    pending = []
    for doc_id in doc_ids:
        task = render_document.delay(doc_id, "no-cache", None, force=True, invalidate_cdn_cache=False)
        pending.append((doc_id, task, task.state))

    idle_polls = 0
    while pending:
        count_before = len(pending)
        still_pending = []
        for doc_id, task, _ in pending:
            state = task.state
            if state in READY_STATES:
                notify('done', doc_id, task)
            else:
                still_pending.append((doc_id, task, state))
        pending = still_pending

        # A poll in which nothing finished counts toward the stuck limit.
        if len(pending) == count_before:
            idle_polls += 1
        else:
            idle_polls = 0

        if idle_polls >= stuck_time:
            for doc_id, task, _ in pending:
                notify('stuck', doc_id, task)
            return (total - len(pending), len(pending))
        if pending:
            sleep(1)
    return total, 0


def purgable_count():
    """Return the current length of the ``mdn_purgeable`` Redis list.

    Raises ValueError when settings.CELERY_BROKER_URL does not start with
    ``redis://``.
    """
    broker_url = settings.CELERY_BROKER_URL
    if not broker_url.startswith('redis://'):
        raise ValueError('Not redis broker: %s' % broker_url)
    return redis.from_url(broker_url).llen('mdn_purgeable')


def null_notify_wait_purgable(event, count, limit):
    """No-op queue notifier: accept a purgable-queue event and discard it.

    wait_purgable falls back to this when no notifier is supplied.
    """
    return None


def verbose_notify_wait_purgable(event, count, limit):
    """Print the purgable queue's target and current depth for an event."""
    template = "Purgable queue %s: Target depth %d, Current depth %d"
    print(template % (event, limit, count))


def wait_purgable(limit=1, notifier_func=None):
    """
    Block until the purgable queue holds at most ``limit`` tasks.

    Keyword Arguments:
    limit (1) - Queue depth at or below which waiting stops; must be >= 0.
    notifier_func (None) - Called as (event, count, limit) with the events
        'not redis', 'start' and 'progress'.

    When purgable_count raises ValueError (as it does for a non-Redis
    broker), the notifier gets 'not redis' with a count of -1, and the
    function sleeps 5 seconds and returns.
    """
    assert limit >= 0
    notify = notifier_func or null_notify_wait_purgable
    try:
        depth = purgable_count()
    except ValueError:
        notify('not redis', -1, limit)
        sleep(5)
        return
    notify('start', depth, limit)
    # Re-check every 15 seconds until the queue is at or below the limit.
    while depth > limit:
        sleep(15)
        depth = purgable_count()
        notify('progress', depth, limit)


def chunks(items, chunk_size):
    """Yield consecutive slices of ``items``, each at most chunk_size long.

    The final slice is shorter when len(items) is not a multiple of
    chunk_size.
    """
    for start in range(0, len(items), chunk_size):
        stop = start + chunk_size
        yield items[start:stop]


def collect_doc_ids(docs, verbose=True, doc_filter=None):
    '''Return the IDs of the documents to rerender, ordered by ID.

    Keyword arguments:
    docs - A queryset of Documents
    verbose - Print progress while filtering
    doc_filter - Optional callable taking a Document; only documents for
        which it returns a true value are kept
    '''
    candidate_ids = list(docs.order_by('id').values_list('id', flat=True))
    if not doc_filter:
        return list(candidate_ids)

    if verbose:
        print("Processing %d documents for relevant docs..." % len(candidate_ids))
    # Each candidate is fetched individually and handed to doc_filter.
    kept_ids = [
        doc_id for doc_id in candidate_ids
        if doc_filter(Document.objects.get(id=doc_id))
    ]
    if verbose:
        print("%d of %d documents remain." % (len(kept_ids), len(candidate_ids)))
    return kept_ids


def error_count(doc_ids):
    '''Return how many of the given documents have non-null rendered_errors.'''
    with_errors = Document.objects.filter(id__in=doc_ids).exclude(
        rendered_errors__isnull=True)
    return with_errors.count()


def rerender_slow(docs, verbose=True, limit=100, error_percent=10.0, doc_filter=None):
    '''Re-render a Document queryset one chunk at a time.

    After each chunk the purgable queue is allowed to drain and the
    chunk's documents are checked for rendering errors. The run stops
    early once the errored documents reach error_percent of the documents
    processed so far.

    Keyword arguments:
    docs - A queryset of Documents
    verbose - Print progress, render events and queue depth
    limit - How many documents to rerender per chunk
    error_percent - A float in range (0.0, 100.0], to abort due to KS errors.
    doc_filter - A further filter of doc instances

    Return: A tuple:
    - Total number of docs rendered
    - Total number of docs unrendered (stuck)
    - Total number of docs with kumascript errors
    - Time in seconds it took to re-render slowly
    '''
    started = time()
    rerender_notify = verbose_notify_rerender_chunk if verbose else None
    wait_notify = verbose_notify_wait_purgable if verbose else None

    doc_ids = collect_doc_ids(docs, verbose, doc_filter)
    total = len(doc_ids)
    rendered = unrendered = errored = progress = 0

    wait_purgable(notifier_func=wait_notify)
    for chunk in chunks(doc_ids, limit):
        progress += len(chunk)
        if verbose:
            print("*** Rendering %d of %d docs (%0.1f%%)"
                  % (progress, total, 100.0 * float(progress) / float(total)))
        finished, stuck = rerender_chunk(chunk, notifier_func=rerender_notify)
        rendered += finished
        unrendered += stuck

        # Let the purgable queue clear before counting errors.
        wait_purgable(notifier_func=wait_notify)

        chunk_errors = error_count(chunk)
        if verbose and chunk_errors:
            print("%d errored documents in last chunk." % chunk_errors)
        errored += chunk_errors

        # Abort when the cumulative error ratio reaches the threshold.
        if errored >= progress * error_percent / 100.0:
            if verbose:
                print("%d of %d documents have errors, aborting."
                      % (errored, progress))
            break
    return rendered, unrendered, errored, time() - started


def macro_docs_and_filter(macro_name):
    """Build a candidate queryset and a per-document filter for one macro.

    Returns (docs, macro_filter). ``docs`` holds the Documents whose html
    contains the macro name, ignoring case. ``macro_filter(doc)`` is True
    when doc.extract.macro_names() includes the macro, ignoring case.
    """
    wanted = macro_name.lower()

    def macro_filter(doc):
        used = [name.lower() for name in doc.extract.macro_names()]
        return wanted in used

    return Document.objects.filter(html__icontains=wanted), macro_filter


def rerender_macro_users(macro_name, verbose=True):
    """Re-render the documents that use ``macro_name``.

    Returns the rerender_slow result tuple.
    """
    candidates, uses_macro = macro_docs_and_filter(macro_name)
    return rerender_slow(candidates, doc_filter=uses_macro, verbose=verbose)


def macro_list_docs_and_filter(macro_names):
    """Build a candidate queryset and a per-document filter for many macros.

    Requires at least two macro names. Returns (docs, macros_filter).
    ``docs`` is the union of the Documents whose html contains any of the
    names, ignoring case. ``macros_filter(doc)`` is True when
    doc.extract.macro_names() includes at least one of them, ignoring case.
    """
    assert len(macro_names) > 1
    wanted = [name.lower() for name in macro_names]

    def macros_filter(doc):
        used = [name.lower() for name in doc.extract.macro_names()]
        for macro in wanted:
            if macro in used:
                return True
        return False

    first, rest = wanted[0], wanted[1:]
    docs = Document.objects.filter(html__icontains=first)
    for name in rest:
        docs |= Document.objects.filter(html__icontains=name)
    return docs, macros_filter


def rerender_users_of_macro_list(macro_names, verbose=True):
    """Re-render the documents that use any macro in ``macro_names``.

    Returns the rerender_slow result tuple.
    """
    candidates, uses_any_macro = macro_list_docs_and_filter(macro_names)
    return rerender_slow(candidates, doc_filter=uses_any_macro, verbose=verbose)


# Single macro version: re-render every document that uses ``macro``.
# Edit the macro name before running.
macro = 'CertifiedBadge'
rendered, unrendered, errored, seconds = rerender_macro_users(macro)

# Multiple macro version (https://github.com/mdn/kumascript/pull/789).
# To use it, comment out the two lines above and uncomment the two below.
# macros = ['APIRef', 'AddonSidebar', 'CSSRef', 'CanvasSidebar', 'DefaultAPISidebar', 'DocStatusQuickLinks', 'FirefoxOSAPIRef', 'FirefoxOSSidebar', 'FirefoxSidebar', 'GamesSidebar', 'HTMLMainQuickLinks', 'HTMLRef', 'HTMLSidebar', 'HTTPSidebar', 'JSSidebar', 'LearnSidebar', 'MDNSidebar', 'SVGRef', 'ServiceWorkerSidebar', 'SpiderMonkeySidebar', 'ToolsSidebar', 'WebAssemblySidebar', 'WebGLSidebar', 'WebRTCSidebar', 'eventref', 'jsctypesSidebar', 'nsprapiref']
# rendered, unrendered, errored, seconds = rerender_users_of_macro_list(macros)

print("Rendered %d docs, %d left unrendered, %d errored, in %d seconds." % (rendered, unrendered, errored, seconds))
	# From https://gist.github.com/jwhitlock/43e34e07bef8c3f1863e91f076778ca6
	from time import sleep, time

	import redis
	from celery.states import READY_STATES
	from django.conf import settings

	from kuma.wiki.models import Document
	from kuma.wiki.tasks import render_document


	def null_notify_rerender_chunk(event, doc_id, task):
	    """No-op render notifier: accept a render event and discard it.

	    rerender_chunk falls back to this when no notifier is supplied.
	    """
	    return None


	# Cache of document ID -> full URL, filled by verbose_notify_rerender_chunk
	# so each document is fetched at most once.
	doc_urls = {}


	def verbose_notify_rerender_chunk(event, doc_id, task):
	    """Print one line describing a render event.

	    The line holds the event name, the task's current state, the document
	    ID and the document's full URL. URLs are memoized in ``doc_urls``.
	    """
	    if doc_id in doc_urls:
	        url = doc_urls[doc_id]
	    else:
	        # First event for this document: look it up and remember its URL.
	        url = Document.objects.get(id=doc_id).get_full_url()
	        doc_urls[doc_id] = url
	    print("Render %s (%s): %d %s" % (event, task.state, doc_id, url))


	def rerender_chunk(doc_ids, stuck_time=120, notifier_func=None):
	    """
	    Queue a render task for every document ID, then poll until all finish.

	    Keyword Arguments:
	    doc_ids - A sequence of document IDs to re-render
	    stuck_time (120) - How many consecutive polls (one second apart) may
	        pass without any task finishing before giving up.
	    notifier_func (None) - Called as (event, doc_id, task): 'done' when a
	        task reaches a ready state, 'stuck' for each unfinished task when
	        giving up.

	    Return is a tuple of counts (tasks finished, tasks still unfinished)
	    """
	    notify = notifier_func or null_notify_rerender_chunk
	    total = len(doc_ids)

	    # Entries are (doc_id, task, last observed state); finished tasks are
	    # dropped from the list. No 'start' event is emitted when queueing.
	    pending = []
	    for doc_id in doc_ids:
	        task = render_document.delay(doc_id, "no-cache", None, force=True, invalidate_cdn_cache=False)
	        pending.append((doc_id, task, task.state))

	    idle_polls = 0
	    while pending:
	        count_before = len(pending)
	        still_pending = []
	        for doc_id, task, _ in pending:
	            state = task.state
	            if state in READY_STATES:
	                notify('done', doc_id, task)
	            else:
	                still_pending.append((doc_id, task, state))
	        pending = still_pending

	        # A poll in which nothing finished counts toward the stuck limit.
	        if len(pending) == count_before:
	            idle_polls += 1
	        else:
	            idle_polls = 0

	        if idle_polls >= stuck_time:
	            for doc_id, task, _ in pending:
	                notify('stuck', doc_id, task)
	            return (total - len(pending), len(pending))
	        if pending:
	            sleep(1)
	    return total, 0


	def purgable_count():
	    """Return the current length of the ``mdn_purgeable`` Redis list.

	    Raises ValueError when settings.CELERY_BROKER_URL does not start with
	    ``redis://``.
	    """
	    broker_url = settings.CELERY_BROKER_URL
	    if not broker_url.startswith('redis://'):
	        raise ValueError('Not redis broker: %s' % broker_url)
	    return redis.from_url(broker_url).llen('mdn_purgeable')


	def null_notify_wait_purgable(event, count, limit):
	    """No-op queue notifier: accept a purgable-queue event and discard it.

	    wait_purgable falls back to this when no notifier is supplied.
	    """
	    return None


	def verbose_notify_wait_purgable(event, count, limit):
	    """Print the purgable queue's target and current depth for an event."""
	    template = "Purgable queue %s: Target depth %d, Current depth %d"
	    print(template % (event, limit, count))


	def wait_purgable(limit=1, notifier_func=None):
	    """
	    Block until the purgable queue holds at most ``limit`` tasks.

	    Keyword Arguments:
	    limit (1) - Queue depth at or below which waiting stops; must be >= 0.
	    notifier_func (None) - Called as (event, count, limit) with the events
	        'not redis', 'start' and 'progress'.

	    When purgable_count raises ValueError (as it does for a non-Redis
	    broker), the notifier gets 'not redis' with a count of -1, and the
	    function sleeps 5 seconds and returns.
	    """
	    assert limit >= 0
	    notify = notifier_func or null_notify_wait_purgable
	    try:
	        depth = purgable_count()
	    except ValueError:
	        notify('not redis', -1, limit)
	        sleep(5)
	        return
	    notify('start', depth, limit)
	    # Re-check every 15 seconds until the queue is at or below the limit.
	    while depth > limit:
	        sleep(15)
	        depth = purgable_count()
	        notify('progress', depth, limit)


	def chunks(items, chunk_size):
	    """Yield consecutive slices of ``items``, each at most chunk_size long.

	    The final slice is shorter when len(items) is not a multiple of
	    chunk_size.
	    """
	    for start in range(0, len(items), chunk_size):
	        stop = start + chunk_size
	        yield items[start:stop]


	def collect_doc_ids(docs, verbose=True, doc_filter=None):
	    '''Return the IDs of the documents to rerender, ordered by ID.

	    Keyword arguments:
	    docs - A queryset of Documents
	    verbose - Print progress while filtering
	    doc_filter - Optional callable taking a Document; only documents for
	        which it returns a true value are kept
	    '''
	    candidate_ids = list(docs.order_by('id').values_list('id', flat=True))
	    if not doc_filter:
	        return list(candidate_ids)

	    if verbose:
	        print("Processing %d documents for relevant docs..." % len(candidate_ids))
	    # Each candidate is fetched individually and handed to doc_filter.
	    kept_ids = [
	        doc_id for doc_id in candidate_ids
	        if doc_filter(Document.objects.get(id=doc_id))
	    ]
	    if verbose:
	        print("%d of %d documents remain." % (len(kept_ids), len(candidate_ids)))
	    return kept_ids


	def error_count(doc_ids):
	    '''Return how many of the given documents have non-null rendered_errors.'''
	    with_errors = Document.objects.filter(id__in=doc_ids).exclude(
	        rendered_errors__isnull=True)
	    return with_errors.count()


	def rerender_slow(docs, verbose=True, limit=100, error_percent=10.0, doc_filter=None):
	    '''Re-render a Document queryset one chunk at a time.

	    After each chunk the purgable queue is allowed to drain and the
	    chunk's documents are checked for rendering errors. The run stops
	    early once the errored documents reach error_percent of the documents
	    processed so far.

	    Keyword arguments:
	    docs - A queryset of Documents
	    verbose - Print progress, render events and queue depth
	    limit - How many documents to rerender per chunk
	    error_percent - A float in range (0.0, 100.0], to abort due to KS errors.
	    doc_filter - A further filter of doc instances

	    Return: A tuple:
	    - Total number of docs rendered
	    - Total number of docs unrendered (stuck)
	    - Total number of docs with kumascript errors
	    - Time in seconds it took to re-render slowly
	    '''
	    started = time()
	    rerender_notify = verbose_notify_rerender_chunk if verbose else None
	    wait_notify = verbose_notify_wait_purgable if verbose else None

	    doc_ids = collect_doc_ids(docs, verbose, doc_filter)
	    total = len(doc_ids)
	    rendered = unrendered = errored = progress = 0

	    wait_purgable(notifier_func=wait_notify)
	    for chunk in chunks(doc_ids, limit):
	        progress += len(chunk)
	        if verbose:
	            print("*** Rendering %d of %d docs (%0.1f%%)"
	                  % (progress, total, 100.0 * float(progress) / float(total)))
	        finished, stuck = rerender_chunk(chunk, notifier_func=rerender_notify)
	        rendered += finished
	        unrendered += stuck

	        # Let the purgable queue clear before counting errors.
	        wait_purgable(notifier_func=wait_notify)

	        chunk_errors = error_count(chunk)
	        if verbose and chunk_errors:
	            print("%d errored documents in last chunk." % chunk_errors)
	        errored += chunk_errors

	        # Abort when the cumulative error ratio reaches the threshold.
	        if errored >= progress * error_percent / 100.0:
	            if verbose:
	                print("%d of %d documents have errors, aborting."
	                      % (errored, progress))
	            break
	    return rendered, unrendered, errored, time() - started


	def macro_docs_and_filter(macro_name):
	    """Build a candidate queryset and a per-document filter for one macro.

	    Returns (docs, macro_filter). ``docs`` holds the Documents whose html
	    contains the macro name, ignoring case. ``macro_filter(doc)`` is True
	    when doc.extract.macro_names() includes the macro, ignoring case.
	    """
	    wanted = macro_name.lower()

	    def macro_filter(doc):
	        used = [name.lower() for name in doc.extract.macro_names()]
	        return wanted in used

	    return Document.objects.filter(html__icontains=wanted), macro_filter


	def rerender_macro_users(macro_name, verbose=True):
	    """Re-render the documents that use ``macro_name``.

	    Returns the rerender_slow result tuple.
	    """
	    candidates, uses_macro = macro_docs_and_filter(macro_name)
	    return rerender_slow(candidates, doc_filter=uses_macro, verbose=verbose)


	def macro_list_docs_and_filter(macro_names):
	    """Build a candidate queryset and a per-document filter for many macros.

	    Requires at least two macro names. Returns (docs, macros_filter).
	    ``docs`` is the union of the Documents whose html contains any of the
	    names, ignoring case. ``macros_filter(doc)`` is True when
	    doc.extract.macro_names() includes at least one of them, ignoring case.
	    """
	    assert len(macro_names) > 1
	    wanted = [name.lower() for name in macro_names]

	    def macros_filter(doc):
	        used = [name.lower() for name in doc.extract.macro_names()]
	        for macro in wanted:
	            if macro in used:
	                return True
	        return False

	    first, rest = wanted[0], wanted[1:]
	    docs = Document.objects.filter(html__icontains=first)
	    for name in rest:
	        docs |= Document.objects.filter(html__icontains=name)
	    return docs, macros_filter


	def rerender_users_of_macro_list(macro_names, verbose=True):
	    """Re-render the documents that use any macro in ``macro_names``.

	    Returns the rerender_slow result tuple.
	    """
	    candidates, uses_any_macro = macro_list_docs_and_filter(macro_names)
	    return rerender_slow(candidates, doc_filter=uses_any_macro, verbose=verbose)


	# Single macro version: re-render every document that uses ``macro``.
	# Edit the macro name before running.
	macro = 'CertifiedBadge'
	rendered, unrendered, errored, seconds = rerender_macro_users(macro)

	# Multiple macro version (https://github.com/mdn/kumascript/pull/789).
	# To use it, comment out the two lines above and uncomment the two below.
	# macros = ['APIRef', 'AddonSidebar', 'CSSRef', 'CanvasSidebar', 'DefaultAPISidebar', 'DocStatusQuickLinks', 'FirefoxOSAPIRef', 'FirefoxOSSidebar', 'FirefoxSidebar', 'GamesSidebar', 'HTMLMainQuickLinks', 'HTMLRef', 'HTMLSidebar', 'HTTPSidebar', 'JSSidebar', 'LearnSidebar', 'MDNSidebar', 'SVGRef', 'ServiceWorkerSidebar', 'SpiderMonkeySidebar', 'ToolsSidebar', 'WebAssemblySidebar', 'WebGLSidebar', 'WebRTCSidebar', 'eventref', 'jsctypesSidebar', 'nsprapiref']
	# rendered, unrendered, errored, seconds = rerender_users_of_macro_list(macros)

	print("Rendered %d docs, %d left unrendered, %d errored, in %d seconds." % (rendered, unrendered, errored, seconds))