Skip to content

Instantly share code, notes, and snippets.

@snarkmaster
Created September 13, 2018 18:28
Show Gist options
  • Save snarkmaster/e426af8e7f22dcd4890704d86f9b12f9 to your computer and use it in GitHub Desktop.
Save snarkmaster/e426af8e7f22dcd4890704d86f9b12f9 to your computer and use it in GitHub Desktop.
Parsing RPM location href from the repo's primary XML
class _RpmLocationParser(AbstractContextManager):
    '''
    Parses through -primary.xml.gz and extracts `href` from each `location`.

    Use it as a context manager and call `feed()` with successive chunks of
    the gzip-compressed file; each call returns an iterator over the `href`
    values that became available with that chunk.

    NB: This XML parsing takes ~4 seconds, while it takes ~1.5 seconds to
    unpack -primary.sqlite.bz2 and `SELECT location_href FROM packages`.
    '''

    def __init__(self):
        # `MAX_WBITS + 16` tells zlib to expect a gzip header and trailer.
        self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
        # Attributes are already available on 'start', so there is no need
        # to wait for the element to be closed.
        self.xml_parser = XMLPullParser(['start'])
        # ElementTree mangles the tags thus: '{xml_namespace}tag_name'.
        # The trailing `$` is required because the pattern is applied with
        # `.match()`: without it, any tag that merely starts with
        # 'location' would be accepted.
        self.location_re = re.compile('({[^}]+}|)location$')

    # This context manager does not suppress exceptions.
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Closing the parser breaks some circular refs to conserve RAM.
        self.xml_parser.close()

    def feed(self, chunk: bytes) -> Iterator[str]:
        '''
        Decompresses `chunk`, feeds the result to the XML parser, and yields
        the `href` attribute of every `location` element started so far.

        This is a generator: nothing is decompressed or parsed until the
        returned iterator is consumed.
        '''
        self.xml_parser.feed(self.decompressor.decompress(chunk))
        for event, elt in self.xml_parser.read_events():
            # The parser was configured to report only 'start' events.
            assert event == 'start'
            if self.location_re.match(elt.tag):
                yield elt.attrib['href']
@snarkmaster
Copy link
Author

For the record, this is the discarded solution. If you are here, you want to get your data from the .sqlite.bz2, since it's faster and easier to query.

@snarkmaster
Copy link
Author

I managed to make this ~2x faster by using Element.clear() and reducing the chunk size that I feed into XMLPullParser:

class XMLRpmParser(AbstractContextManager):
    '''
    Extracts RPM location, checksum, and size from -primary.xml.gz.  See the
    docblock of `SQLiteRpmParser` to learn why this parser is dispreferred,
    and why it exists anyway.

    Use it as a context manager and call `feed()` with successive chunks of
    the gzip-compressed file.  A package whose XML straddles two chunks is
    handled correctly: the fields collected so far are kept on the instance
    between `feed()` calls.
    '''

    def __init__(self):
        # `MAX_WBITS + 16` tells zlib to expect a gzip header and trailer.
        self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
        self.xml_parser = ElementTree.XMLPullParser(['end'])
        # ElementTree mangles the tags thus: '{xml_namespace}tag_name'
        self.tag_re = re.compile('({[^}]+}|)(location|size|checksum|package)$')
        # Fields of the package currently being parsed.  These must outlive
        # a single `feed()` call, because the child elements of a package
        # and its closing tag can arrive in different chunks.
        self._location = None
        self._size = None
        self._checksum = None

    # This context manager does not suppress exceptions.
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Closing the parser detects incomplete XML files. It also breaks
        # some circular refs to speed up GC.
        self.xml_parser.close()

    def feed(self, chunk: bytes) -> Iterator[Rpm]:
        '''
        Decompresses `chunk`, parses the resulting XML, and yields one `Rpm`
        for every `package` element that was closed by this chunk.

        This is a generator: nothing is decompressed or parsed until the
        returned iterator is consumed.
        '''
        while chunk:
            # Consume the decompressed data in small chunks. This prevents
            # us from using unbounded amounts of RAM for decompression.
            # More crucially, apparently XMLPullParser gets up to 50% slower
            # on package data if we feed it larger chunks.  This buffer size
            # was picked experimentally :)
            #
            # NB: zlib appears to copy bytes into `unconsumed_tail` instead
            # of using something like `memoryview`, so this has poor
            # theoretical complexity due to all the extra copying.  I could
            # add an extra layer of input chunking to mitigate this, but in
            # practice it seems ok to just limit the incoming chunk size.
            self.xml_parser.feed(self.decompressor.decompress(
                chunk, max_length=2 ** 14,
            ))
            chunk = self.decompressor.unconsumed_tail
            for _event, elt in self.xml_parser.read_events():
                m = self.tag_re.match(elt.tag)
                if not m:
                    continue
                tag = m.group(2)
                if tag == 'location':
                    self._location = elt.attrib['href']
                elif tag == 'size':
                    self._size = elt.attrib['package']
                elif tag == 'checksum':
                    assert elt.attrib['pkgid'] == 'YES'
                    self._checksum = elt.attrib['type'], elt.text
                elif tag == 'package':
                    rpm = Rpm(
                        location=self._location,
                        size=int(self._size),
                        checksum_type=self._checksum[0],
                        checksum_value=self._checksum[1],
                    )
                    # Reset before yielding, so that a consumer abandoning
                    # the iterator cannot leave stale fields behind for the
                    # next package.
                    self._location = None
                    self._size = None
                    self._checksum = None
                    elt.clear()  # Uses less RAM, speeds up the run 50%
                    yield rpm

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment