Created
April 22, 2016 12:14
-
-
Save borowis/e2d47541d554fd5a768800d9a6cb0fe6 to your computer and use it in GitHub Desktop.
Split text/xml files on regexp, file_size. Similar to some hybrid of csplit + split. Usage: ./file_split.py -M 8092 --regexp "^<tu>\s*$" big.xml --> split potentially huge big.xml into approximately 8 Mb chunks with names big.0.xml, big.1.xml etc. Split on lines matching the regular expression provided. Initial idea from https://gist.github.com/benal…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import re | |
import codecs | |
from optparse import OptionParser | |
from math import log10 | |
# Default character encoding used to read the input and write the chunks;
# overridable from the command line via --encoding.
ENCODING = "utf-8"
# Byte threshold after which the next line matching the split regexp starts
# a new chunk; overridable from the command line via -M/--max_size (in Kb).
MAX_SIZE = 1024*1024 # 1Mb
def main(filename, output_dir, split_regexp):
    """Split *filename* into chunks of roughly MAX_SIZE bytes each.

    A new chunk is only started on a line matching *split_regexp*, so a
    split never lands in the middle of a logical record.  Chunks are named
    <root>.<index><ext> and written to *output_dir* (or, when that is None,
    to the directory the original file lives in).
    """
    cur_idx = 0
    # Width of the zero-padded chunk index.  max(..., 1) guards against
    # log10(0) ("math domain error") when the file is smaller than MAX_SIZE.
    approx_chunks = max(os.path.getsize(filename) // MAX_SIZE, 1)
    fmt = ".%%0%dd" % (int(log10(approx_chunks)) + 1)
    # Keep the original path intact for reading; the original code
    # reassigned `filename` to the basename and then failed to open any
    # file that was not in the current working directory.
    out_dir, base_name = os.path.split(filename)
    if output_dir is not None:
        out_dir = output_dir
    root, ext = os.path.splitext(base_name)
    cur_file = next_file(out_dir, root + fmt % cur_idx + ext)
    bytesread = 0
    try:
        with codecs.open(filename, 'rt', ENCODING) as original_file:
            for line in original_file:
                # Count encoded bytes, not characters, so multi-byte
                # encodings split close to the requested byte size.
                bytesread += len(line.encode(ENCODING))
                if bytesread > MAX_SIZE and split_regexp.match(line):
                    cur_file.close()
                    bytesread = 0
                    print("part %d done" % cur_idx)
                    cur_idx += 1
                    cur_file = next_file(out_dir, root + fmt % cur_idx + ext)
                cur_file.write(line)
    finally:
        # Release the last (or in-flight, on error) chunk handle.
        cur_file.close()
    print("part %d done" % cur_idx)
def next_file(output_dir, file_name):
    """Create and return the next chunk file, opened for text-mode writing
    with the module-wide ENCODING."""
    target = os.path.join(output_dir, file_name)
    return codecs.open(target, 'wt', ENCODING)
if __name__ == "__main__":
    parser = OptionParser(usage="usage: %prog [options] XML_FILE")
    # Fixed: the two concatenated help literals were missing a separating
    # space and rendered as "...written(default...".
    parser.add_option("-o", "--output-dir",
                      help="Specify the directory where the files will be written "
                           "(default to the same directory where the original file is)")
    parser.add_option("-M", "--max_size", type="int",
                      help="Specify the size at which the files should be split (in Kb)")
    parser.add_option("--regexp", help="Split on regexp")
    parser.add_option("--encoding", help="Encoding to use. Default to UTF-8")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments")

    if options.max_size is not None:
        # -M is given in Kb; the module-level default is 1 Mb.
        MAX_SIZE = options.max_size * 1024
        print("Requested size = %s (bytes)" % MAX_SIZE)

    # --regexp is mandatory: it defines where a new chunk may start.
    if options.regexp is None:
        parser.error("Parameter 'regexp' is required")
    else:
        split_regexp = re.compile(options.regexp)

    if options.encoding is not None:
        ENCODING = options.encoding

    main(args[0], options.output_dir, split_regexp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I needed to split huge xml files into smaller files but only on a specific tag. Original idea was to use https://gist.github.com/benallard/8042835, but unfortunately it used a SAX parser to process xml files and I was not getting the exact same entities back (for instance, I had `&quot;` in the original file but it came back as `"` — which is no big deal actually if we are not talking attributes, but the problem was I couldn't verify the split as byte counts differed considerably). So I created a 'dumb' splitter based on that code which works similar to csplit in that it uses a regexp to find a place to split on, and similar to split in that it splits by file size.