lemon24/feedparser_memory_usage.py

## feedparser_memory_usage.py
"""
I used https://pythonspeed.com/products/filmemoryprofiler/
on https://github.com/lemon24/reader

Saw that feedparser accounted for a big part of the memory used.

The monkeypatch below avoids some encode()/decode() calls in the
encoding detection heuristics used in feedparser.encodings.convert_to_utf8();
https://github.com/kurtmckee/feedparser/blob/6.0.2/feedparser/encodings.py#L71

Instead of running the heuristic on the full string,
it runs it only on the first 32 KiB of data,
and then uses the guessed encoding _once_ on the full string.

Ideally, we'd never read the whole feed in memory,
but it's not easy to make feedparser do this.

Results for my full database (~150 feeds), with all feeds marked as stale,
and 40 update workers:

* decreases process mem from 72 mb to 63 mb
* decreases feedparser.parse mem from 44.7m to 36.7m

After this monkeypatch, most of the memory is used in XML parsing.

"""
import feedparser.encodings

# Keep a reference to the original implementation: new_convert() calls it
# on a prefix of the data to detect the encoding, and on the full data
# whenever its own shortcut cannot be used.
old_convert = feedparser.encodings.convert_to_utf8

def new_convert(http_headers, data, result):
    """Memory-friendlier replacement for feedparser's convert_to_utf8().

    Rather than running the original function's encoding detection on the
    whole document, run it on a prefix (the first 32 KiB) and then
    transcode the full document once, using the encoding it reported.

    Args:
        http_headers: Passed through unchanged to the original function.
        data: The raw feed document. It is sliced and, unless the detected
            encoding is 'utf-8', decoded, so it is expected to be bytes.
        result: Dict updated in place with everything the chosen probe
            reported (its 'encoding', plus 'bozo' etc. if present).

    Returns:
        The document encoded as UTF-8. If the probe reported 'utf-8', the
        input is returned as-is. If no encoding was detected, or the full
        document cannot be transcoded, the return value of the original
        function applied to the full document.
    """
    print('--- new_convert')

    prefix_len = 2**15

    # Probe a prefix of the document. If the probe is flagged as bozo,
    # retry with up to 3 more bytes -- presumably so that a multi-byte
    # character cut in half at the boundary does not cause a spurious
    # failure.
    first_probe = None
    chosen = None
    for extra in range(4):
        end = prefix_len + extra
        probe = {}
        old_convert(http_headers, data[:end], probe)
        if first_probe is None:
            first_probe = probe
        if not probe.get('bozo'):
            chosen = probe
            break
        if end >= len(data):
            # The slice already covers the whole document, so a longer
            # slice would be identical; retrying cannot help.
            break
    if chosen is None:
        # Every probe was bozo: fall back to the first one.
        chosen = first_probe

    encoding = chosen.get('encoding')

    if not encoding:
        print('--- old_convert due to no encoding')
        return old_convert(http_headers, data, result)

    # NOTE(review): when the probe says 'utf-8', the bytes past the probed
    # prefix are returned without being decoded, i.e. they are not validated.
    # NOTE(review): only the bytes are transcoded here; if the original
    # function also edits the document (e.g. its XML declaration), that is
    # not replicated -- confirm against feedparser.
    if encoding != 'utf-8':
        try:
            data = data.decode(encoding).encode('utf-8')
        except (ValueError, LookupError):
            # ValueError covers UnicodeDecodeError/UnicodeEncodeError;
            # LookupError is raised for a codec name Python does not know.
            # `data` is still the original input here, since the
            # assignment above did not complete.
            print('--- old_convert due to encode/decode ValueError')
            return old_convert(http_headers, data, result)

    result.update(chosen)
    return data

# Install the patch by rebinding the name on both modules.
# NOTE(review): presumably feedparser.api holds its own reference to the
# function (e.g. via a from-import), hence the second assignment -- confirm.
feedparser.encodings.convert_to_utf8 = new_convert
feedparser.api.convert_to_utf8 = new_convert
	"""
	I used https://pythonspeed.com/products/filmemoryprofiler/
	on https://github.com/lemon24/reader

	Saw that feedparser accounted for a big part of the memory used.

	The monkeypatch below avoids some encode()/decode() calls in the
	encoding detection heuristics used in feedparser.encodings.convert_to_utf8();
	https://github.com/kurtmckee/feedparser/blob/6.0.2/feedparser/encodings.py#L71

	Instead of running the heuristic on the full string,
	it runs it only on the first 32 KiB of data,
	and then uses the guessed encoding _once_ on the full string.

	Ideally, we'd never read the whole feed in memory,
	but it's not easy to make feedparser do this.

	Results for my full database (~150 feeds), with all feeds marked as stale,
	and 40 update workers:

	* decreases process mem from 72 mb to 63 mb
	* decreases feedparser.parse mem from 44.7m to 36.7m

	After this monkeypatch, most of the memory is used in XML parsing.

	"""
	import feedparser.encodings

	# Keep a reference to the original implementation: new_convert() calls it
	# on a prefix of the data to detect the encoding, and on the full data
	# whenever its own shortcut cannot be used.
	old_convert = feedparser.encodings.convert_to_utf8

	def new_convert(http_headers, data, result):
	    """Memory-friendlier replacement for feedparser's convert_to_utf8().

	    Rather than running the original function's encoding detection on the
	    whole document, run it on a prefix (the first 32 KiB) and then
	    transcode the full document once, using the encoding it reported.

	    Args:
	        http_headers: Passed through unchanged to the original function.
	        data: The raw feed document. It is sliced and, unless the detected
	            encoding is 'utf-8', decoded, so it is expected to be bytes.
	        result: Dict updated in place with everything the chosen probe
	            reported (its 'encoding', plus 'bozo' etc. if present).

	    Returns:
	        The document encoded as UTF-8. If the probe reported 'utf-8', the
	        input is returned as-is. If no encoding was detected, or the full
	        document cannot be transcoded, the return value of the original
	        function applied to the full document.
	    """
	    print('--- new_convert')

	    prefix_len = 2**15

	    # Probe a prefix of the document. If the probe is flagged as bozo,
	    # retry with up to 3 more bytes -- presumably so that a multi-byte
	    # character cut in half at the boundary does not cause a spurious
	    # failure.
	    first_probe = None
	    chosen = None
	    for extra in range(4):
	        end = prefix_len + extra
	        probe = {}
	        old_convert(http_headers, data[:end], probe)
	        if first_probe is None:
	            first_probe = probe
	        if not probe.get('bozo'):
	            chosen = probe
	            break
	        if end >= len(data):
	            # The slice already covers the whole document, so a longer
	            # slice would be identical; retrying cannot help.
	            break
	    if chosen is None:
	        # Every probe was bozo: fall back to the first one.
	        chosen = first_probe

	    encoding = chosen.get('encoding')

	    if not encoding:
	        print('--- old_convert due to no encoding')
	        return old_convert(http_headers, data, result)

	    # NOTE(review): when the probe says 'utf-8', the bytes past the probed
	    # prefix are returned without being decoded, i.e. they are not validated.
	    # NOTE(review): only the bytes are transcoded here; if the original
	    # function also edits the document (e.g. its XML declaration), that is
	    # not replicated -- confirm against feedparser.
	    if encoding != 'utf-8':
	        try:
	            data = data.decode(encoding).encode('utf-8')
	        except (ValueError, LookupError):
	            # ValueError covers UnicodeDecodeError/UnicodeEncodeError;
	            # LookupError is raised for a codec name Python does not know.
	            # `data` is still the original input here, since the
	            # assignment above did not complete.
	            print('--- old_convert due to encode/decode ValueError')
	            return old_convert(http_headers, data, result)

	    result.update(chosen)
	    return data

	# Install the patch by rebinding the name on both modules.
	# NOTE(review): presumably feedparser.api holds its own reference to the
	# function (e.g. via a from-import), hence the second assignment -- confirm.
	feedparser.encodings.convert_to_utf8 = new_convert
	feedparser.api.convert_to_utf8 = new_convert