Created
April 22, 2016 12:14
-
-
Save borowis/e2d47541d554fd5a768800d9a6cb0fe6 to your computer and use it in GitHub Desktop.
Split text/xml files on regexp, file_size. Similar to some hybrid of csplit + split. Usage: ./file_split.py -M 8092 --regexp "^<tu>\s*$" big.xml --> split potentially huge big.xml into approximately 8 Mb chunks with names big.0.xml, big.1.xml etc. Split on lines matching the regular expression provided. Initial idea from https://gist.github.com/benal…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import re | |
import codecs | |
from optparse import OptionParser | |
from math import log10 | |
# Default character encoding used to read the input and write the chunks;
# overridable from the command line via --encoding.
ENCODING = "utf-8"
# Byte threshold after which the next line matching the split regexp starts
# a new chunk; overridable from the command line via -M/--max_size (in Kb).
MAX_SIZE = 1024*1024 # 1Mb
def main(filename, output_dir, split_regexp):
    """Split *filename* into chunks of roughly MAX_SIZE bytes each.

    A new chunk is only started on a line matching *split_regexp*, so a
    split never lands in the middle of a logical record.  Chunks are named
    <root>.<index><ext> and written to *output_dir* (or, when that is None,
    to the directory the original file lives in).
    """
    cur_idx = 0
    # Width of the zero-padded chunk index.  max(..., 1) guards against
    # log10(0) ("math domain error") when the file is smaller than MAX_SIZE.
    approx_chunks = max(os.path.getsize(filename) // MAX_SIZE, 1)
    fmt = ".%%0%dd" % (int(log10(approx_chunks)) + 1)
    # Keep the original path intact for reading; the original code
    # reassigned `filename` to the basename and then failed to open any
    # file that was not in the current working directory.
    out_dir, base_name = os.path.split(filename)
    if output_dir is not None:
        out_dir = output_dir
    root, ext = os.path.splitext(base_name)
    cur_file = next_file(out_dir, root + fmt % cur_idx + ext)
    bytesread = 0
    try:
        with codecs.open(filename, 'rt', ENCODING) as original_file:
            for line in original_file:
                # Count encoded bytes, not characters, so multi-byte
                # encodings split close to the requested byte size.
                bytesread += len(line.encode(ENCODING))
                if bytesread > MAX_SIZE and split_regexp.match(line):
                    cur_file.close()
                    bytesread = 0
                    print("part %d done" % cur_idx)
                    cur_idx += 1
                    cur_file = next_file(out_dir, root + fmt % cur_idx + ext)
                cur_file.write(line)
    finally:
        # Release the last (or in-flight, on error) chunk handle.
        cur_file.close()
    print("part %d done" % cur_idx)
def next_file(output_dir, file_name):
    """Create and return the next chunk file, opened for text-mode writing
    with the module-wide ENCODING."""
    target = os.path.join(output_dir, file_name)
    return codecs.open(target, 'wt', ENCODING)
if __name__ == "__main__":
    parser = OptionParser(usage="usage: %prog [options] XML_FILE")
    # Fixed: the two concatenated help literals were missing a separating
    # space and rendered as "...written(default...".
    parser.add_option("-o", "--output-dir",
                      help="Specify the directory where the files will be written "
                           "(default to the same directory where the original file is)")
    parser.add_option("-M", "--max_size", type="int",
                      help="Specify the size at which the files should be split (in Kb)")
    parser.add_option("--regexp", help="Split on regexp")
    parser.add_option("--encoding", help="Encoding to use. Default to UTF-8")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments")

    if options.max_size is not None:
        # -M is given in Kb; the module-level default is 1 Mb.
        MAX_SIZE = options.max_size * 1024
        print("Requested size = %s (bytes)" % MAX_SIZE)

    # --regexp is mandatory: it defines where a new chunk may start.
    if options.regexp is None:
        parser.error("Parameter 'regexp' is required")
    else:
        split_regexp = re.compile(options.regexp)

    if options.encoding is not None:
        ENCODING = options.encoding

    main(args[0], options.output_dir, split_regexp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I needed to split huge xml files into smaller files but only on a specific tag. Original idea was to use https://gist.github.com/benallard/8042835, but unfortunately it used a SAX parser to process xml files and I was not getting the exact same entities back (for instance, I had `&quot;` in the original file but it came back as `"` — which is no big deal actually if we are not talking attributes, but the problem was I couldn't verify the split as byte counts differed considerably). So I created a 'dumb' splitter based on that code which works similar to csplit in that it uses a regexp to find a place to split on, and similar to split in that it splits by file size.