@greg-randall
Created February 2, 2024 21:11
A real basic way of splitting WordPress XML/WXR export files. Set the input filename and the number of items per output file in the two variables at the top of the script.
input_file = 'wordpress_export_file.xml'
item_per_output_file = 500

# Read the whole export file into memory.
with open(input_file, 'r', encoding='utf-8') as file:
    content = file.read()

# Header and body/footer split.
parts = content.split("<item>", 1)        # Split the content at the first instance of "<item>"
header = parts[0]                         # The 'header' is everything before "<item>"
body_footer = f"<item>{parts[1]}"         # The rest is everything after and including "<item>"

# Body/footer split.
parts = body_footer.rsplit("</item>", 1)  # Split the content at the last instance of "</item>"
body = f"{parts[0]}</item>"               # The 'body' is everything before and including "</item>"
footer = parts[1]                         # The 'footer' is everything after "</item>"

# Split the body into items.
items = body.split("</item>")             # Split the 'body' by "</item>"
if items[-1] == '':                       # Remove the trailing empty item if it exists
    items = items[:-1]
items = [item + "</item>" for item in items]  # Re-append "</item>" to the end of each item

print(f"Input file split into {len(items)} items.")

# Initialize variables
output_count = 1
grouped_items = []

# Loop through the items
for i, item in enumerate(items, start=1):
    grouped_items.append(item)
    # If we've collected a full batch or reached the end of the list
    if i % item_per_output_file == 0 or i == len(items):
        # Write the header, the grouped items, and the footer to a new file
        with open(f'output_{output_count}.xml', 'w', encoding='utf-8') as f:
            f.write(header)
            f.write(''.join(grouped_items))
            f.write(footer)
        # Prepare for the next file
        output_count += 1
        grouped_items = []

print(f"Created {output_count - 1} output files with up to {item_per_output_file} items per file.")