Skip to content

Instantly share code, notes, and snippets.

@mlt
Last active December 10, 2020 23:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mlt/69cf82e5c8a34d1015648e31821376f0 to your computer and use it in GitHub Desktop.
Slightly modified from http://furius.ca/pubcode/pub/conf/bin/pg-split-dump to produce a type/schema/name.sql directory structure, with all overloaded function definitions kept in the same file.
#!/usr/bin/env python
"""
Split a PostgreSQL plain dump file into files for each of its objects.
This is used to compare two SQL database contents: both contents are dumped to a
plain text file, and they are then processed through this script which generates
a file for each database object, which can then be compared using a directory
diff program.
"""
# stdlib imports
import os, re
from os.path import join, exists
from string import maketrans
# Note: this code is pulled straight out of the xxdiff library.
# Copied here to avoid the dependency.
# Matches a pg_dump plain-text section header, e.g.:
#   --
#   -- Name: foo(integer); Type: FUNCTION; Schema: public; Owner: bob
# Groups: 1 = object name (up to the first '(' or ';'),
#         2 = object type, 3 = schema name.
sec_re = re.compile(
    r'^--\n-- (?:Data for )?Name:\s+'
    r'(?:DOMAIN|OPERATOR|SCHEMA|VIEW|COLUMN|EXTENSION|TABLE|FUNCTION|SEQUENCE)?'
    r'\s*([^(;]+).*;\s*Type:\s+([^;]+);\s*Schema:\s+([^;]+);.*$',
    re.M)

def parse_dump(dbdump, sort_columns=False):
    """
    Parse a PostgreSQL plain-text dump, extracting all its sections into a
    list of (name, type, schema, contents) tuples built from the Name, Type
    and Schema fields of the header comments.

    Consecutive sections sharing the same name (e.g. overloaded functions)
    are merged: the first section of the run owns all the text up to the
    next distinctly-named section, and the following duplicates are returned
    with contents None so callers can skip them.  Contents is also None for
    sections with no body text.  Contents of data chunks for tables come
    back with type 'data', so if you're going to turn this into a map, be
    careful to filter out or concatenate those entries as appropriate.

    :param dbdump: full text of the dump file.
    :param sort_columns: accepted for backward compatibility; unused.
    :return: list of (name, type, schema, contents) tuples where 'type' is
        lowercased with spaces replaced by underscores.
    """
    # Materialize the matches: finditer() is lazy and we index into it.
    matches = list(sec_re.finditer(dbdump))
    nmatches = len(matches)
    results = []
    for i, mo in enumerate(matches):
        name, typ, nspname = mo.group(1, 2, 3)
        if i > 0 and matches[i - 1].group(1) == name:
            # Duplicate of the previous section (overload): its text already
            # belongs to the first section of the run.
            contents = None
        else:
            # Skip past any same-named duplicates to find where this
            # section's text really ends (next distinct header, or EOF).
            j = i + 1
            while j < nmatches and matches[j].group(1) == name:
                j += 1
            end = matches[j].start() if j < nmatches else len(dbdump)
            contents = dbdump[mo.end():end]
        descline = mo.group(0)  # the header comment itself
        results.append((name, typ.replace(' ', '_').lower(), nspname,
                        descline + contents if contents else None))
    return results
# Note: this code is pulled straight out of the xxdiff library.
# Copied here to avoid the dependency.
def consepairs(seq):
    """
    Iterate over consecutive pairs of chunks with distinct names.

    Chunks whose name repeats the previous yielded chunk's name are skipped
    (their ``contents`` attribute is set to None as a side effect), so that
    e.g. all overloads of a function get merged into one section span.

    :param seq: iterable of objects with ``name`` and ``contents`` attrs.
    :yield: (prev, current) pairs of consecutive distinctly-named chunks.
    """
    it = iter(seq)
    try:
        # py2's it.next() was removed in Python 3; next() works on both.
        prev = next(it)
    except StopIteration:
        # Empty input: nothing to pair.  Returning (rather than letting
        # StopIteration escape) avoids a RuntimeError under PEP 479.
        return
    for el in it:
        if prev.name == el.name:
            # Duplicate (overload): mark it content-less and keep extending
            # the previous chunk's span.
            el.contents = None
            continue
        yield prev, el
        prev = el
def main():
    """
    Command-line entry point.

    Usage: script DUMPFILE [OUTDIR] — splits DUMPFILE into one
    <outdir>/<type>/<schema>/<name>.sql file per database object.
    """
    import optparse
    parser = optparse.OptionParser(__doc__.strip())
    opts, args = parser.parse_args()
    if len(args) not in (1, 2):
        parser.error("You must specify a single dumpfile, "
                     "and optionally an output directory.")
    dumpfile = args[0]
    rootdir = args[1] if len(args) == 2 else os.getcwd()
    if not exists(rootdir):
        os.makedirs(rootdir)
    # Close the dump file promptly instead of leaking the handle.
    with open(dumpfile) as f:
        sections = parse_dump(f.read())
    # 'objtype' instead of 'type': don't shadow the builtin.
    for name, objtype, nspname, contents in sections:
        if not contents:
            continue  # merged overload duplicates / empty sections
        outdir = join(rootdir, objtype, nspname)
        if not exists(outdir):
            os.makedirs(outdir)
        # Sanitize the object name for use as a filename: '[' and ']'
        # become '_', double quotes are dropped (replaces the py2-only
        # string.maketrans / two-argument str.translate idiom).
        safe = name.replace('[', '_').replace(']', '_').replace('"', '')
        outname = '{name}.sql'.format(name=safe)
        print(outname)
        with open(join(outdir, outname), 'w') as out:
            out.write(contents.rstrip() + '\n')

if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment