Created August 26, 2022 20:44
Utility script to custom-export WordPress blog posts via the REST API, downloading their images and retaining the original image paths.
from requests import get
from bs4 import BeautifulSoup
from os import path, makedirs
from shutil import copyfileobj
from json import dumps
from time import sleep
# Base URL of the WordPress site we want to scrape, e.g. "https://www.example.com/".
# The trailing / is needed; it is appended below when it is missing.
domain_name = ""
# Prefix removed from in-content image URLs when building local paths. It is
# applied after the "https://" and "www." parts have already been stripped,
# so it is presumably of the form "example.com/" -- confirm for your site.
domain_root = ""
# Number of result pages to request. Found in the header of the response
# (WordPress reports it as X-WP-TotalPages); I got it from Postman.
total_pages = 5
# Add the trailing / if not found. endswith() is used rather than indexing
# [-1] so that an unset (empty) domain_name does not raise IndexError.
if domain_name and not domain_name.endswith('/'):
    domain_name = domain_name + '/'
def ensure_dir(file_path):
    """Create the parent directory of ``file_path`` if it does not exist.

    Args:
        file_path: Path to a file (the file itself is not created). Only the
            directory portion, as returned by ``os.path.dirname``, is used.

    Returns:
        None.
    """
    directory = path.dirname(file_path)
    # A bare file name has no directory component, and makedirs('') raises,
    # so there is nothing to create in that case.
    if directory:
        # exist_ok makes the call a no-op when the directory is already there.
        makedirs(directory, exist_ok=True)
def downlaod_image(url, local_path):
    """Download a single image from the server to ``local_path``.

    The (misspelled) function name is kept as-is because callers in this
    file use it.

    Args:
        url: Absolute URL of the image to fetch.
        local_path: Destination file path. Missing parent directories are
            created so the remote folder structure can be preserved locally.

    Returns:
        None. When the server answers with an error status the download is
        skipped and a message is printed; no file is written.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    # timeout hard-coded for 5 seconds
    # the with-block releases the streamed connection when we are done
    with get(url, headers=headers, stream=True, timeout=5) as r:
        if not r.ok:
            # do not save an HTML error page under an image file name
            print('Skipping (HTTP ' + str(r.status_code) + '): ', url)
            return
        # make sure the target folder exists before opening the file
        target_dir = path.dirname(local_path)
        if target_dir:
            makedirs(target_dir, exist_ok=True)
        # r.raw is the undecoded socket stream; ask urllib3 to undo any
        # gzip/deflate transfer encoding so the bytes on disk are the image
        r.raw.decode_content = True
        with open(local_path, "wb") as f:
            copyfileobj(r.raw, f)
def _dig(obj, *steps):
    """Follow ``steps`` (dict keys / list indexes) into ``obj``; return None if any step is missing."""
    try:
        for step in steps:
            obj = obj[step]
    except (KeyError, IndexError, TypeError):
        return None
    return obj


def _content_image_local_path(img_url):
    """Turn an in-content image URL into the relative path it is saved under."""
    # run the search-replace in stages since some urls might not have www
    local_path = img_url
    for prefix in ('https://', 'http://', 'www.'):
        local_path = local_path.replace(prefix, '')
    return local_path.replace(domain_root + 'wp-content/', '')


def get_posts(domain, data):
    """Convert one page of posts from the ?page=X endpoint into plain dicts.

    Runs once per page. For every post the featured image (if any) and every
    image in the rendered content whose URL contains ``wp-content/uploads``
    are downloaded via ``downlaod_image``.

    Args:
        domain: Root domain (with trailing /) used in the search-replace that
            turns the featured image URL into a local path.
        data: Iterable of post objects as decoded from the API response,
            requested with ``_embed`` so featured media and terms are inline.

    Returns:
        A list with one dict per post, with the keys ``title``, ``slug``,
        ``category``, ``post_date``, ``featured_image`` and ``post_content``.
        ``category`` and ``featured_image`` are None when the post has none.

    Raises:
        KeyError: If a post lacks ``slug``, ``date``, ``title`` or ``content``.
    """
    rtn = []
    for d in data:
        # both lookups yield None when the post has no featured image / category
        featured_img_url = _dig(d, '_embedded', 'wp:featuredmedia', 0, 'source_url')
        category = _dig(d, '_embedded', 'wp:term', 0, 0, 'name')

        # download featured image and keep path structure
        featured_local_path = None
        if featured_img_url:
            # remove domain and wp path to images
            # used as local path to keep file/folder structure
            featured_local_path = featured_img_url.replace(domain + 'wp-content/', '')
            print('Downloading: ', featured_img_url)
            downlaod_image(featured_img_url, featured_local_path)

        # split out to vars for clarity
        slug = d['slug']
        post_date = d['date']
        title = d['title']['rendered']
        post_content = d['content']['rendered']

        # grab any images inside the content itself
        soup = BeautifulSoup(post_content, "html.parser")
        for img in soup.find_all('img'):
            img_url = img.get('src')
            # skip <img> tags without a src, and images linked from other
            # domains (anything that is not a wp-content upload)
            if not img_url or 'wp-content/uploads' not in img_url:
                continue
            img_local_path = _content_image_local_path(img_url)
            print('Downloading: ', img_url)
            downlaod_image(img_url, img_local_path)

        # collect a json-serializable obj for this post
        rtn.append({
            "title": title,
            "slug": slug,
            "category": category,
            "post_date": post_date,
            "featured_image": featured_local_path,
            "post_content": post_content,
        })
    return rtn
def main():
    """Fetch every page of posts, download their images and write out.json.

    Requests pages 1..``total_pages`` of the site's ``wp/v2/posts`` endpoint,
    passes each page to ``get_posts`` and writes the combined list of posts
    to ``out.json`` in the current working directory as a single JSON array.

    Returns:
        None.
    """
    base_endpoint = domain_name + "wp-json/wp/v2/posts/"
    # set per_page for demoing pagination
    start_url = base_endpoint + "?per_page=20&_embed&page="
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    all_posts = []
    for current_page in range(1, total_pages + 1):
        current_url = start_url + str(current_page)
        resp = get(current_url, headers=headers, timeout=30)
        if not resp.ok:
            # an error response carries an error object, not a list of posts;
            # stop here but still write what has been collected so far
            print('Stopping (HTTP ' + str(resp.status_code) + '): ', current_url)
            break
        data = resp.json()
        posts_from_api_page = get_posts(domain_name, data)
        all_posts.extend(posts_from_api_page)
        # comment this out to go faster
        # NOTE(review): pause between pages; one second is an assumed value
        sleep(1)
    with open('out.json', 'w') as outf:
        outf.write(dumps(all_posts))
# run the export only when executed as a script, not when imported
if __name__ == "__main__":
    main()
