"""
Working of off feedparser 6.0.8.
To do:
* [x] don't read entire file in memory
* [x] allow fallback to reading entire file in memory
* allow using alternative (defusedxml, lxml) sax handlers
* [x] fix content order bug
* [~] fix loose parsing
* still need to document that fallback only works for seekable streams
* full-featured parse()
"""
import io
import codecs
import xml.sax

import feedparser as fp
from feedparser.encodings import convert_to_utf8
from feedparser.sanitizer import replace_doctype
from feedparser.urls import make_safe_absolute_uri
from feedparser.util import FeedParserDict

from prefixfilewrapper import PrefixFileWrapper, StreamFactory, MissingEncoding


def convert_file_prefix_to_utf8(http_headers, file, result):
    # based on https://gist.github.com/lemon24/dbe0f5f0cad3be3e1646c61cb026061d

    prefix_len = 2**12
    prefix = file.read(prefix_len)

    # we call convert_to_utf8() up to 4 times,
    # to make sure we eventually land on a code point boundary
    for _ in range(4):
        fake_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

        if not fake_result.get('bozo'):
            break

        # check if the prefix we have is actually the whole thing
        if len(prefix) < prefix_len:
            break

        byte = file.read(1)
        if not byte:
            break

        prefix += byte
        prefix_len += 1

    result.update(fake_result)
    return converted_prefix
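

# Why up to 4 times: a UTF-8 code point is at most 4 bytes long, so if the
# prefix happens to end mid-code-point, extending it by at most 3 bytes is
# enough to reach a boundary. A minimal illustration (not part of the
# original gist):
#
#     >>> data = 'aé'.encode('utf-8')  # b'a\xc3\xa9'
#     >>> data[:2].decode('utf-8')  # cut falls inside the 2-byte é
#     Traceback (most recent call last):
#       ...
#     UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 1: unexpected end of data
#     >>> data[:3].decode('utf-8')  # one more byte lands on a boundary
#     'aé'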


OPTIMISTIC_ENCODING_DETECTION = True


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection):
    if optimistic_encoding_detection:
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        result['version'], prefix, entities = replace_doctype(prefix)
        file = PrefixFileWrapper(prefix, file)
    else:
        # this shouldn't increase memory usage if file is BytesIO,
        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)
        result['version'], data, entities = replace_doctype(data)
        # still need to be able to reset() to the "beginning"
        file = PrefixFileWrapper(b'', io.BytesIO(data))

    return StreamFactory(file, result.get('encoding')), entities


def parse(
    file,
    response_headers=None,
    resolve_relative_uris=None,
    sanitize_html=None,
    optimistic_encoding_detection=None,
):
    # similar to the original code
    if sanitize_html is None:
        sanitize_html = fp.SANITIZE_HTML
    if resolve_relative_uris is None:
        resolve_relative_uris = fp.RESOLVE_RELATIVE_URIS
    if optimistic_encoding_detection is None:
        optimistic_encoding_detection = OPTIMISTIC_ENCODING_DETECTION

    result = FeedParserDict(
        bozo=False,
        entries=[],
        feed=FeedParserDict(),
        headers={},
    )
    result['headers'].update(response_headers or {})

    original_file = file
    stream_factory, entities = convert_file_to_utf8(
        result['headers'], file, result, optimistic_encoding_detection
    )
    # at this point, if the original file was seekable,
    # file.reset() will seek the original file to its initial tell();
    # also, file.close() will be ignored
    # (because the sax parser closes the file when done, and we don't want that)
    # TODO: when implementing parse() for real, if the file object is not from the user, close it

    # similar to the original code
    use_strict_parser = result['encoding'] and True or False

    contentloc = result['headers'].get('content-location', '')
    href = result.get('href', '')
    baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href

    baselang = result['headers'].get('content-language', None)
    if isinstance(baselang, bytes) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    if not fp.api._XML_AVAILABLE:
        use_strict_parser = 0

    if use_strict_parser:
        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
        feedparser.resolve_relative_uris = resolve_relative_uris
        feedparser.sanitize_html = sanitize_html

        saxparser = xml.sax.make_parser(fp.api.PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)

        source = xml.sax.xmlreader.InputSource()

        # if an encoding was detected, decode the file on the fly;
        # otherwise, pass it as-is and let the SAX parser deal with it
        try:
            source.setCharacterStream(stream_factory.get_text_file())
        except MissingEncoding:
            source.setByteStream(stream_factory.get_binary_file())

        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0

    if not use_strict_parser:
        # falling back to the loose parser only works with seekable files:
        #   io.UnsupportedOperation: underlying stream is not seekable

        # decode the file on the fly;
        # if an encoding was detected, use it;
        # otherwise assume utf-8 and do your best
        data = stream_factory.get_text_file('utf-8', 'replace').read()

        # similar to the original code
        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.resolve_relative_uris = resolve_relative_uris
        feedparser.sanitize_html = sanitize_html
        feedparser.feed(data)

    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespaces_in_use

    return result
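

# The mixin below is, apparently, the "fix content order bug" item from the
# to-do list above: for Atom feeds, <summary> is always parsed as a
# plain-text summary element, instead of deferring to feedparser's default
# _start_summary(), which can treat a summary as another piece of content.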
class _SummaryMixin:

    def _start_summary(self, attrs_d):
        if not self.version.startswith('atom'):
            return super()._start_summary(attrs_d)

        self._summaryKey = 'summary'
        self.push_content(self._summaryKey, attrs_d, 'text/plain', 1)


class StrictFeedParser(_SummaryMixin, fp.api.StrictFeedParser): pass
class LooseFeedParser(_SummaryMixin, fp.api.LooseFeedParser): pass
"""
# after passing file as-is to source (no encoding handling, no loose parsing)
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: loose parser not implemented
better 11.6 28
feedparser 13.9 56
noop 0.0 18
# after we convert_to_utf8() only the prefix and avoid reading the whole file
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: loose parser not implemented
better 10.1 32
feedparser 10.6 60
noop 0.0 19
better results are the same as for feedparser
# after PrefixFileWrapper and optimistic_encoding_detection
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: 'PrefixFileWrapper' object has no attribute 'seek'
better 10.1 31
feedparser 10.6 60
noop 0.0 19
# after StreamFactory
better 10.9 33.2
better_bytes 10.4 33.9
feedparser 10.8 60.6
noop 0.0 20.0
"""
if __name__ == "__main__":
from textwrap import dedent
from pprint import pprint, pformat
import sys, feedparser, difflib
lines = dedent("""\
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="UTF-8" ?>
<?xml version="1.0" encoding="utf-8" standalone="yes" ?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<?xml version="1.0"?>
<rss version='2.0'>
""").splitlines()
"""
for line in lines:
line = line.encode('gbk')
result = {}
headers = {'content-type': 'application/xml; charset=ms932'}
data = convert_to_utf8(headers, line, result)
print(line, data, result, sep='\n', end='\n\n')
"""
"""
for path in sys.stdin:
path = path.rstrip()
with open(path, 'rb') as f:
original = feedparser.parse(f)
f.seek(0)
try:
better = parse(f)
except NotImplementedError:
continue
if original != better:
content_equal = []
for eo, eb in zip(original.entries, better.entries):
eoa = ([eo.summary] if eo.summary else []) + [c.value for c in eo.content]
eba = ([eb.summary] if eb.summary else []) + [c.value for c in eb.content]
content_equal.append(set(eoa) == set(eba))
if all(content_equal):
continue
print('===', path)
print(*difflib.ndiff(pformat(original).splitlines(), pformat(better).splitlines()), sep='\n')
# the only one that's different is _feeds/https-sobolevn-me-feed-xml.atom
# but i checked it by hand and it looks OK (it has both content and summary)
"""
with open('index.xml', 'rb') as f:
pprint(parse(f))
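

# === benchmark script (a separate file in this gist) ===
#
# Usage sketch, inferred from the argv/stdin handling below (the script and
# input file names are stand-ins):
#
#     python bench.py better < feed-paths.txt
#
# where argv[1] is one of feedparser, better, better_bytes, atoma, or noop,
# and stdin is a list of feed file paths, one per line.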
import sys, time, resource, io

import feedparser, atoma, betterfeedparser


def feedparser_parse(path, file):
    return feedparser.parse(
        file,
        resolve_relative_uris=False,
        sanitize_html=False,
    )


def better_parse(path, file):
    return betterfeedparser.parse(
        file,
        resolve_relative_uris=False,
        sanitize_html=False,
    )


def better_bytes_parse(path, file):
    # like better_parse, but reads the whole file in memory first
    return betterfeedparser.parse(
        io.BytesIO(file.read()),
        resolve_relative_uris=False,
        sanitize_html=False,
    )


def atoma_parse(path, file):
    return getattr(atoma, f'parse_{path.rpartition(".")[2]}_file')(file)


def noop_parse(*_):
    pass


impl = sys.argv[1]
parse = locals()[f'{impl}_parse']

timings = 0
for line in sys.stdin:
    path = line.rstrip()
    with open(path, 'rb') as file:
        try:
            start = time.perf_counter()
            parse(path, file)
            end = time.perf_counter()
            timings += end - start
        except Exception as e:
            print(f'error: {path}: {e}', file=sys.stderr)

# ru_maxrss is in bytes on macOS, and in kilobytes on Linux
maxrss = (
    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    / 2 ** (20 if sys.platform == 'darwin' else 10)
)

print(impl, round(timings, 1), round(maxrss, 1))
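

# === prefixfilewrapper.py (a separate file in this gist; the module name is
# confirmed by the `from prefixfilewrapper import ...` in the main module) ===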
import codecs
import io


class PrefixFileWrapper:
    """
    >>> import io
    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'
    >>> wrapped.reset()
    >>> wrapped.read()
    'CDef'
    >>>

    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        try:
            self.file_initial_offset = file.tell()
        except OSError:
            self.file_initial_offset = None
        self.offset = 0

    def reset(self):
        # raises io.UnsupportedOperation if the underlying stream is not seekable
        self.file.seek(self.file_initial_offset)
        self.offset = 0

    def read(self, size=-1):
        # empty read, so buffer has the same type as the file's content
        # (bytes or str)
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)
            if size <= 0:
                break
            size -= len(chunk)

        return buffer

    def close(self):
        # never close the underlying file;
        # the sax parser closes the file when done, and we don't want that
        pass


class MissingEncoding(io.UnsupportedOperation): pass


class StreamFactory:

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")
        reader_factory = codecs.getreader(encoding)
        reader = reader_factory(self.file, errors)
        self.reset()
        return reader

    def get_binary_file(self):
        self.reset()
        return self.file

    def reset(self):
        # the file is already positioned correctly for the first consumer;
        # only reset it for subsequent ones
        if self.should_reset:
            self.file.reset()
        self.should_reset = True
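

# A minimal usage sketch for StreamFactory (not part of the original gist):
# the first get_*_file() call returns the stream positioned as-is, and every
# later call reset()s it first, so each consumer reads from the same position.
#
#     >>> file = PrefixFileWrapper(b'', io.BytesIO(b'<feed/>'))
#     >>> factory = StreamFactory(file, 'utf-8')
#     >>> factory.get_text_file().read()
#     '<feed/>'
#     >>> factory.get_text_file('utf-8', 'replace').read()
#     '<feed/>'


# === tests for PrefixFileWrapper (a separate file in this gist) ===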
import io

# commented out so it doesn't mess with memory measurements
#import pytest

# assumed import, so the tests below are self-contained
from prefixfilewrapper import PrefixFileWrapper


def make_file_in_the_middle(data):
    # a file whose initial position is not zero
    prefix = b'zzzzz'
    rv = io.BytesIO(prefix + data)
    rv.seek(len(prefix))
    return rv


class make_file_one_by_one(io.BytesIO):
    # a file that returns at most one byte per read() call
    def read(self, size=-1):
        if size <= 0:
            return super().read(size)
        return super().read(1)


"""
@pytest.mark.parametrize('make_file', [
    io.BytesIO,
    make_file_in_the_middle,
    make_file_one_by_one,
])
"""
def test_pfw(make_file):
    f = PrefixFileWrapper(b'abc', make_file(b'def'))

    assert f.read() == b'abcdef'
    assert f.read() == b''

    f.reset()
    assert f.read(2) == b'ab'
    assert f.read(2) == b'cd'
    assert f.read(2) == b'ef'
    assert f.read(2) == b''
    assert f.read() == b''

    f.reset()
    assert f.read(3) == b'abc'
    assert f.read(3) == b'def'
    assert f.read(3) == b''
    assert f.read() == b''

    f.reset()
    assert f.read(0) == b''
    assert f.read() == b'abcdef'

    f.reset()
    f.reset()
    assert f.read() == b'abcdef'


class make_file_not_seekable(io.BytesIO):
    def tell(self):
        raise io.UnsupportedOperation

    def seek(self, *args):
        raise io.UnsupportedOperation


def test_pfw_not_seekable():
    f = PrefixFileWrapper(b'abc', make_file_not_seekable(b'def'))

    assert f.read() == b'abcdef'
    assert f.read() == b''

    with pytest.raises(io.UnsupportedOperation):
        f.reset()
    assert f.read() == b''

    f = PrefixFileWrapper(b'abc', make_file_not_seekable(b'def'))

    assert f.read(3) == b'abc'

    with pytest.raises(io.UnsupportedOperation):
        f.reset()
    assert f.read() == b'def'


def test_pfw_no_prefix():
    f = PrefixFileWrapper(b'', io.BytesIO(b'abc'))

    assert f.read(1) == b'a'
    assert f.read() == b'bc'

    f.reset()
    assert f.read() == b'abc'
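

# To actually run these tests, uncomment `import pytest` above and turn the
# quoted @pytest.mark.parametrize block back into a decorator on test_pfw().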