# snarkmaster/rpm_location_parser.py

## rpm_location_parser.py
class _RpmLocationParser(AbstractContextManager):
    '''
    Parses through -primary.xml.gz and extracts `href` from each `location`.
    NB: This XML parsing takes ~4 seconds, while it takes ~1.5 seconds to
    unpack -primary.sqlite.bz2 and `SELECT location_href FROM packages`.

    Use it as a context manager and pass consecutive chunks of the gzipped
    file to `feed()`, iterating over what each call returns.  Leaving the
    context closes the XML parser, which raises `ParseError` if the
    document fed so far is incomplete.
    '''

    def __init__(self):
        # `MAX_WBITS + 16` makes zlib expect a gzip header and trailer.
        self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
        # Only 'start' events are requested: the `href` attribute is already
        # available when the opening tag of an element has been read.
        self.xml_parser = XMLPullParser(['start'])
        # ElementTree mangles the tags thus: '{xml_namespace}tag_name'
        self.location_re = re.compile(r'({[^}]+}|)location')

    # This context manager does not suppress exceptions.
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        '''
        Close the XML parser.

        Raises `ParseError` on a clean exit if the XML document is
        incomplete.  If the `with` body is already raising, that exception
        is left to propagate instead of being replaced by the `ParseError`.
        '''
        # Imported locally so that this class does not rely on the module's
        # import list providing `ParseError`.
        from xml.etree.ElementTree import ParseError
        try:
            # Closing the parser breaks some circular refs to conserve RAM.
            self.xml_parser.close()
        except ParseError:
            # On a clean exit an unfinished document means truncated input
            # and must be reported.  While unwinding from another error it
            # would only hide the real cause, so it is dropped.
            if exc_type is None:
                raise

    def feed(self, chunk: bytes) -> Iterator[str]:
        '''
        Decompress `chunk`, feed it to the XML parser, and yield the `href`
        of every `location` element whose opening tag is now complete.

        This is a generator: nothing is decompressed or parsed until the
        returned iterator is consumed, so every call must be iterated, and
        in the same order as the chunks appear in the file.

        Raises `KeyError` if a `location` element has no `href` attribute.
        '''
        self.xml_parser.feed(self.decompressor.decompress(chunk))
        for event, elt in self.xml_parser.read_events():
            # Internal invariant: the parser was configured for 'start' only.
            assert event == 'start'
            # `fullmatch`, not `match`: a tag that merely begins with
            # "location" (e.g. `{ns}locations`) is a different element.
            if self.location_re.fullmatch(elt.tag):
                yield elt.attrib['href']