Skip to content

Instantly share code, notes, and snippets.

@snarkmaster
Created September 13, 2018 18:28
Show Gist options
  • Save snarkmaster/e426af8e7f22dcd4890704d86f9b12f9 to your computer and use it in GitHub Desktop.
Save snarkmaster/e426af8e7f22dcd4890704d86f9b12f9 to your computer and use it in GitHub Desktop.
Parsing RPM location href from the repo's primary XML
class _RpmLocationParser(AbstractContextManager):
    '''
    Parses through -primary.xml.gz and extracts `href` from each `location`.

    NB: This XML parsing takes ~4 seconds, while it takes ~1.5 seconds to
    unpack -primary.sqlite.bz2 and `SELECT location_href FROM packages`.

    Use it as a context manager, passing successive chunks of the
    gzip-compressed file to `feed()`.  Leaving the `with` block closes the
    XML parser, which raises `ParseError` if the document was incomplete.
    '''

    def __init__(self):
        # `MAX_WBITS + 16` makes zlib expect a gzip header and trailer.
        self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
        self.xml_parser = XMLPullParser(['start'])
        # ElementTree mangles the tags thus: '{xml_namespace}tag_name'
        self.location_re = re.compile(r'({[^}]+}|)location')

    # This context manager does not suppress exceptions.
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        '''
        Close the XML parser.

        Closing the parser breaks some circular refs to conserve RAM.  It
        also raises `ParseError` on an incomplete document; that error is
        only propagated when the `with` body itself succeeded, so that it
        never replaces the exception that actually aborted the parse.
        '''
        from xml.etree.ElementTree import ParseError
        try:
            self.xml_parser.close()
        except ParseError:
            if exc_type is None:
                raise

    def feed(self, chunk: bytes) -> Iterator[str]:
        '''
        Decompress `chunk`, feed it to the XML parser, and yield the `href`
        attribute of every `location` element that has started so far.

        This is a generator: nothing is decompressed or parsed until the
        caller starts iterating over the result.

        Raises `KeyError` if a `location` element has no `href` attribute.
        '''
        self.xml_parser.feed(self.decompressor.decompress(chunk))
        for event, elt in self.xml_parser.read_events():
            assert event == 'start'
            # `fullmatch` so that tags which merely begin with "location"
            # (e.g. `location-extra`) are not mistaken for `location`.
            if self.location_re.fullmatch(elt.tag):
                yield elt.attrib['href']
@snarkmaster
Copy link
Author

I managed to make this ~2x faster by using Element.clear() and reducing the chunk size that I feed into XMLPullParser:

class XMLRpmParser(AbstractContextManager):
    '''
    Extracts RPM location, checksum, and size from -primary.xml.gz.  See the
    docblock of `SQLiteRpmParser` to learn why this parser is dispreferred,
    and why it exists anyway.

    Use it as a context manager, passing successive chunks of the
    gzip-compressed file to `feed()`.  A `<package>` element may straddle
    any number of `feed()` calls: the fields collected for the package
    currently being parsed are kept on the instance between calls.
    '''

    def __init__(self):
        # `MAX_WBITS + 16` makes zlib expect a gzip header and trailer.
        self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
        self.xml_parser = ElementTree.XMLPullParser(['end'])
        # ElementTree mangles the tags thus: '{xml_namespace}tag_name'
        self.tag_re = re.compile('({[^}]+}|)(location|size|checksum|package)$')
        # Fields of the package currently being parsed.  These must outlive
        # a single `feed()` call, since the child elements of a package and
        # its closing tag can arrive in different input chunks.
        self._location = None
        self._size = None
        self._checksum = None

    # This context manager does not suppress exceptions.
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        '''
        Close the XML parser.

        Closing the parser detects incomplete XML files.  It also breaks
        some circular refs to speed up GC.  The `ParseError` for an
        incomplete file is only propagated when the `with` body itself
        succeeded, so that it never replaces the exception that actually
        aborted the parse.
        '''
        try:
            self.xml_parser.close()
        except ElementTree.ParseError:
            if exc_type is None:
                raise

    def feed(self, chunk: bytes) -> Iterator[Rpm]:
        '''
        Decompress `chunk`, feed it to the XML parser, and yield one `Rpm`
        for every `<package>` element that has been completed so far.

        This is a generator: nothing is decompressed or parsed until the
        caller iterates over the result, and the chunk is only fully
        consumed once the generator is exhausted.

        Raises `KeyError` if an expected attribute is missing, and
        `AssertionError` if a checksum is not marked `pkgid="YES"`.
        '''
        while chunk:
            # Consume the decompressed data in small chunks. This prevents
            # us from using unbounded amounts of RAM for decompression.
            # More crucially, apparently XMLPullParser gets up to 50% slower
            # on package data if we feed it larger chunks.  This buffer size
            # was picked experimentally :)
            #
            # NB: zlib appears to copy bytes into `unconsumed_tail` instead
            # of using something like `memoryview`, so this has poor
            # theoretical complexity due to all the extra copying.  I could
            # add an extra layer of input chunking to mitigate this, but in
            # practice it seems ok to just limit the incoming chunk size.
            self.xml_parser.feed(self.decompressor.decompress(
                chunk, max_length=2 ** 14,
            ))
            chunk = self.decompressor.unconsumed_tail
            for _event, elt in self.xml_parser.read_events():
                m = self.tag_re.match(elt.tag)
                if not m:
                    continue
                tag = m.group(2)
                if tag == 'location':
                    self._location = elt.attrib['href']
                elif tag == 'size':
                    self._size = elt.attrib['package']
                elif tag == 'checksum':
                    assert elt.attrib['pkgid'] == 'YES'
                    self._checksum = elt.attrib['type'], elt.text
                elif tag == 'package':
                    rpm = Rpm(
                        location=self._location,
                        size=int(self._size),
                        checksum_type=self._checksum[0],
                        checksum_value=self._checksum[1],
                    )
                    # Reset before yielding, so that no stale fields leak
                    # into the next package even if the caller stops
                    # iterating here.
                    self._location = None
                    self._size = None
                    self._checksum = None
                    elt.clear()  # Uses less RAM, speeds up the run 50%
                    yield rpm

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment