Created
February 2, 2024 21:11
-
-
Save greg-randall/2365073937c73e5c0a1eea1e9c890617 to your computer and use it in GitHub Desktop.
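A very basic way of splitting WordPress XML/WXR export files. Set the input filename and the number of items per output file with the `input_file` and `item_per_output_file` variables near the top of the script.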
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Split a WordPress XML/WXR export into several smaller export files.

The input is cut into three parts using plain text matching (no XML parsing):
a header (everything before the first ``<item>``), the list of
``<item>...</item>`` elements, and a footer (everything after the last
``</item>``).  Each output file ``output_<n>.xml`` receives the header, up to
``item_per_output_file`` items, and the footer.

Because the tags are matched as plain text, a literal ``<item>`` or
``</item>`` appearing inside an item's content would be treated as a tag.
"""
import os
import sys

# --- Settings: edit these two values before running the script. ---
input_file = 'wordpress_export_file.xml'
item_per_output_file = 500

ITEM_OPEN = "<item>"
ITEM_CLOSE = "</item>"


def split_wxr(content):
    """Split export text into ``(header, items, footer)``.

    Args:
        content: Full text of the export file.

    Returns:
        A tuple ``(header, items, footer)`` where ``header`` is the text
        before the first ``<item>``, ``items`` is a list of strings each
        ending in ``</item>``, and ``footer`` is the text after the last
        ``</item>``.  ``header + "".join(items) + footer == content``.

    Raises:
        ValueError: If no ``<item>`` followed by a ``</item>`` is present.
    """
    first = content.find(ITEM_OPEN)
    last = content.rfind(ITEM_CLOSE)
    # ``last < first`` also covers ``last == -1`` (no closing tag at all).
    if first == -1 or last < first:
        raise ValueError("no <item>...</item> elements found in the input")

    body_end = last + len(ITEM_CLOSE)
    header = content[:first]
    footer = content[body_end:]

    # The body ends with "</item>", so the final piece of the split is always
    # an empty string; drop it and restore the closing tag on the others.
    pieces = content[first:body_end].split(ITEM_CLOSE)[:-1]
    items = [piece + ITEM_CLOSE for piece in pieces]
    return header, items, footer


def group_items(items, per_file):
    """Return ``items`` chunked into consecutive lists of at most ``per_file``.

    Args:
        items: Sequence of item strings.
        per_file: Maximum number of items per chunk; must be at least 1.

    Returns:
        A list of lists; only the last chunk may be shorter than ``per_file``.

    Raises:
        ValueError: If ``per_file`` is smaller than 1.
    """
    if per_file < 1:
        raise ValueError("item_per_output_file must be at least 1")
    return [items[i:i + per_file] for i in range(0, len(items), per_file)]


def main():
    """Read ``input_file`` and write the numbered ``output_<n>.xml`` files."""
    try:
        # WordPress writes its exports as UTF-8; being explicit avoids
        # depending on the platform's default encoding.
        with open(input_file, 'r', encoding='utf-8') as file:
            content = file.read()
        header, items, footer = split_wxr(content)
        groups = group_items(items, item_per_output_file)
    except (OSError, ValueError) as err:
        # UnicodeDecodeError is a ValueError, so bad encodings land here too.
        sys.exit(f"Error: {err}")

    print(f"Input file split into {len(items)} items.")

    for output_count, group in enumerate(groups, start=1):
        with open(f'output_{output_count}.xml', 'w', encoding='utf-8') as f:
            # Every output file is a complete export: header, items, footer.
            f.write(header)
            f.write(''.join(group))
            f.write(footer)

    print(f"Created {len(groups)} output files with up to {item_per_output_file} items per file.")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment