Skip to content

Instantly share code, notes, and snippets.

@Patrick-1994-
Last active June 20, 2023 00:21
Show Gist options
  • Save Patrick-1994-/acd41c084790aa9d07499b9a76245891 to your computer and use it in GitHub Desktop.
Save Patrick-1994-/acd41c084790aa9d07499b9a76245891 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
'''
Created on May 8, 2010 by @anasimtiaz
Original script URL: http://anasimtiaz.com/?p=51
That link is dead, I suppose this is current: https://anasimtiaz.com/2014/03/08/wordpress-xml-splitter-again/
Forked on May 28, 2016 by @danielwrobert
https://gist.github.com/danielwrobert/6c9ca8de8199d5430621f481673d4baa
Changed into a command-line script by @Patrick-1994- on Jan 24th 2022
VERSION 3.0
Added the -s option to change the split size by @Patrick-1994- in 2023
(also --help and --version, so now I have to give this a version number ^^)
'''
import os
import sys
import re
# Upper bound, in characters of the decoded export, for the slice of the
# input that goes into one output part; used when -s is not given.
DEFAULT_SPLIT_SIZE = 500_000

# Version string printed by --version.
VERSION = "3.0"
def writeHeader(currentFile):
    """Write the opening WXR/RSS boilerplate to *currentFile*.

    The text consists of the XML declaration, the ``<rss>`` element with
    its namespace declarations, the opening ``<channel>`` tag and the
    ``<wp:wxr_version>`` element, terminated by a newline.

    Args:
        currentFile: A writable text stream (anything with ``write(str)``).
    """
    # The continuation lines of the literal start at column 0 on purpose:
    # any indentation here would end up in the generated XML.
    header = '''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.2/"
>
<channel>
<wp:wxr_version>1.2</wp:wxr_version>
'''
    currentFile.write(header)
def writeFooter(currentFile):
    """Write the closing ``</channel>`` and ``</rss>`` tags to *currentFile*.

    The text starts with a newline so the closing tags begin on their own
    line; no newline is written after ``</rss>``.

    Args:
        currentFile: A writable text stream (anything with ``write(str)``).
    """
    footer = '''
</channel>
</rss>'''
    currentFile.write(footer)
def errExit():
    """Terminate the program with exit status 1.

    Raises:
        SystemExit: Always, with ``code`` set to 1.
    """
    sys.exit(1)
def err_print(s: object) -> None:
    """Print *s*, followed by a newline, to standard error.

    Args:
        s: The value to print; it is converted with ``str()`` by ``print``.
    """
    print(s, file=sys.stderr)
def startProc(input_file_path, outDir, split_size):
    """Split the export at *input_file_path* into several smaller XML files.

    The whole input is read into memory and then cut into consecutive
    parts. Each part ends right after the last ``</item>`` that lies within
    the next ``split_size`` characters. If no ``</item>`` lies within that
    window (a single item is larger than ``split_size``), the part is
    extended to the end of the next ``</item>`` so that an item is never
    cut in half.

    Parts are written to *outDir* as ``<stem>_<n><ext>`` where ``<stem>``
    and ``<ext>`` come from the input file name and ``<n>`` counts from 0.
    Part 0 begins with whatever the input begins with; every later part
    begins with the text from ``writeHeader``. Every part except the last
    is closed with ``writeFooter``; the last part ends with whatever the
    input ends with.

    Args:
        input_file_path: Path of the export file to split (read as UTF-8).
        outDir: Existing directory that receives the parts.
        split_size: Maximum number of characters (not bytes) of input to
            put into one part. Must be at least 1.

    Raises:
        ValueError: If ``split_size`` is smaller than 1.
        OSError: If the input cannot be read or a part cannot be written.
    """
    if split_size < 1:
        raise ValueError("split_size must be at least 1")

    item_end = '</item>'
    fileName = os.path.basename(input_file_path)
    fileNameTxt, fileNameExt = os.path.splitext(fileName)

    # The header written by writeHeader declares UTF-8, so read and write
    # with that encoding instead of the platform default.
    with open(input_file_path, "r", encoding="utf-8") as xmlFileObj:
        xmlFile = xmlFileObj.read()
    totalCount = len(xmlFile)

    iteration = 0
    currentCount = 0
    finished = False
    while not finished:
        currentFileName = fileNameTxt + "_" + str(iteration) + fileNameExt
        partPath = os.path.join(outDir, currentFileName)
        # "with" guarantees each part is flushed and closed, even on error.
        with open(partPath, 'w', encoding="utf-8") as currentFile:
            print('Writing file ' + currentFileName)
            if iteration != 0:
                writeHeader(currentFile)

            windowEnd = currentCount + split_size
            cut = -1
            if windowEnd < totalCount:
                # Start of the last </item> fully inside the window.
                cut = xmlFile.rfind(item_end, currentCount, windowEnd)
                if cut == -1:
                    # No complete item fits into the window: extend the
                    # part up to the end of the next item instead of
                    # cutting in the middle of one.
                    cut = xmlFile.find(item_end, currentCount)

            if cut != -1:
                cut += len(item_end)
                currentFile.write(xmlFile[currentCount:cut])
                writeFooter(currentFile)
                currentCount = cut
            else:
                # Either the remainder fits into one window or it contains
                # no further item: it becomes the final part as-is.
                currentFile.write(xmlFile[currentCount:])
                print('Finished processing \n')
                finished = True
        iteration += 1
def print_usage():
    """Print a short usage message to standard error.

    The program name is taken from ``sys.argv[0]`` at call time, so the
    function also works when this file is imported rather than run as a
    script.
    """
    err_print(f"usage: {sys.argv[0]} [ -s SPLIT_SIZE ] input_file_path [outDir]\noutDir is the current working directory by default")
def print_version():
    """Print the script's version string to standard output."""
    print(VERSION, file=sys.stdout)
if __name__ == '__main__':
    # Command-line entry point. Accepted forms:
    #   --help
    #   --version
    #   [-s SPLIT_SIZE] input_file_path [outDir]
    split_size = DEFAULT_SPLIT_SIZE
    # Program name as invoked, kept as a module-level name for usage output.
    ARGV0 = sys.argv[0]
    command_line_args = sys.argv[1:]

    if not command_line_args:  # no args
        print_usage()
        errExit()
    if command_line_args[0] == "--help":
        print_usage()
        sys.exit(0)
    if command_line_args[0] == "--version":
        print_version()
        sys.exit(0)

    if command_line_args[0] == "-s":
        if len(command_line_args) < 2:
            err_print("-s given but no SPLIT_SIZE follows it")
            print_usage()
            errExit()
        # isdecimal() only accepts digits that int() can parse, whereas
        # isnumeric() also accepts characters such as fractions or
        # superscripts, which would make int() raise ValueError.
        if not command_line_args[1].isdecimal():
            err_print("-s given but the arg is not numeric")
            errExit()
        split_size = int(command_line_args[1])
        if split_size < 1:
            err_print("-s given but SPLIT_SIZE must be greater than zero")
            errExit()
        command_line_args = command_line_args[2:]

    # Exactly one or two positional arguments must remain: the input file
    # and, optionally, the output directory.
    if len(command_line_args) not in (1, 2):
        print_usage()
        errExit()

    input_file_path = command_line_args[0]
    if len(command_line_args) == 2:
        outDir = command_line_args[1]
    else:
        outDir = os.path.abspath(os.path.realpath("."))

    if not os.path.isfile(input_file_path):
        err_print(f"""input file "{input_file_path}" does not exist""")
        print_usage()
        errExit()
    if not os.path.isdir(outDir):
        # Report the directory that is actually missing.
        err_print(f"""output directory "{outDir}" does not exist""")
        print_usage()
        errExit()

    startProc(input_file_path, outDir, split_size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment