@greg-randall
Created February 2, 2024 21:11
A real basic way of splitting WordPress XML/WXR export files. Set the input filename and the number of items per output file in the two variables at the top of the script.
input_file = 'wordpress_export_file.xml'
item_per_output_file = 500

# Read the whole export file into memory.
with open(input_file, 'r', encoding='utf-8') as file:
    content = file.read()

# Header and body/footer split.
parts = content.split("<item>", 1)        # Split the content at the first instance of "<item>"
header = parts[0]                         # The 'header' is everything before "<item>"
body_footer = f"<item>{parts[1]}"         # The rest is everything after and including "<item>"

# Body/footer split.
parts = body_footer.rsplit("</item>", 1)  # Split the content at the last instance of "</item>"
body = f"{parts[0]}</item>"               # The 'body' is everything before and including "</item>"
footer = parts[1]                         # The 'footer' is everything after "</item>"

# Split the body into items.
items = body.split("</item>")             # Split the 'body' by "</item>"
if items[-1] == '':                       # Remove the trailing empty item if it exists
    items = items[:-1]
items = [item + "</item>" for item in items]  # Re-append "</item>" to the end of each item

print(f"Input file split into {len(items)} items.")

# Initialize variables
output_count = 1
grouped_items = []

# Loop through the items
for i, item in enumerate(items, start=1):
    grouped_items.append(item)
    # If we've collected a full batch or reached the end of the list
    if i % item_per_output_file == 0 or i == len(items):
        # Write the header, the grouped items, and the footer to a new file
        with open(f'output_{output_count}.xml', 'w', encoding='utf-8') as f:
            f.write(header)
            f.write(''.join(grouped_items))
            f.write(footer)
        # Prepare for the next file
        output_count += 1
        grouped_items = []

print(f"Created {output_count - 1} output files with up to {item_per_output_file} items per file.")