Utility script to do a custom export of WordPress blog posts via the REST API, including images, while retaining the original image paths.
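The script needs only requests and beautifulsoup4. It pages through the site's wp-json/wp/v2/posts endpoint, downloads the featured image and any in-content images into a local folder tree that mirrors wp-content/uploads, and writes the collected posts to out.json. Each entry in out.json has the shape sketched below (a minimal illustration; the values are placeholders, not real export data):

{
    "title": "Example Post Title",
    "slug": "example-post-title",
    "category": "News",
    "post_date": "2020-06-01T12:00:00",
    "featured_image": "uploads/2020/06/example.jpg",
    "post_content": "<p>Rendered post HTML...</p>"
}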
from requests import get
from bs4 import BeautifulSoup
from os import path, makedirs
from shutil import copyfileobj
from json import dumps
from time import sleep
# the site we want to scrape
# the trailing / is needed
domain_name = "https://www.greatbigdigitalagency.com/"
domain_root = "greatbigdigitalagency.com/"
# total number of result pages; found in the response headers (value taken from Postman)
total_pages = 5

# add the trailing / if it is missing
if domain_name[-1] != '/':
    domain_name = domain_name+'/'
# https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
# create the directory for a file path if it does not exist yet
def ensure_dir(file_path):
    directory = path.dirname(file_path)
    if directory and not path.exists(directory):
        makedirs(directory)
# download a single image from the server, preserving the remote path locally
def download_image(url, local_path):
    # timeout hard-coded to 5 seconds
    # https://miro.medium.com/max/1318/1*8xraf6eyaXh-myNXOXkqLA.jpeg
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    r = get(url, headers=headers, stream=True, timeout=5)
    ensure_dir(local_path)
    with open(local_path, "wb") as f:
        copyfileobj(r.raw, f)
# grab the post data returned by the ?page=X endpoint
# domain is the root domain used in search-replace
# this will be run once for each page of results
def get_posts(domain, data):
    rtn = []
    for d in data:
        # dirty check for a featured image
        try:
            featured_img_url = d['_embedded']['wp:featuredmedia'][0]['source_url']
        except (KeyError, IndexError, TypeError):
            featured_img_url = None
        # dirty check for a category
        try:
            category = d['_embedded']['wp:term'][0][0]['name']
        except (KeyError, IndexError, TypeError):
            category = None
        # download the featured image and keep the path structure
        featured_local_path = None
        if featured_img_url:
            # strip the domain and wp-content prefix from the URL;
            # the remainder is the local path, keeping the file/folder structure
            featured_local_path = featured_img_url.replace(domain+'wp-content/', '')
            print('Downloading: ', featured_img_url)
            download_image(featured_img_url, featured_local_path)
        # split out to variables for clarity
        slug = d['slug']
        post_date = d['date']
        title = d['title']['rendered']
        post_content = d['content']['rendered']
        # grab any images inside the content itself
        soup = BeautifulSoup(post_content, "html.parser")
        post_images = soup.find_all('img')
        for img in post_images:
            img_url = img['src']
            # skip images linked from other domains
            if 'wp-content/uploads' not in img_url:
                continue
            # run some search-replace on the URLs
            # do this in stages since some URLs might not have www
            img_local_path = img_url.replace('https://', '')
            img_local_path = img_local_path.replace('www.', '')
            img_local_path = img_local_path.replace(domain_root+'wp-content/', '')
            print('Downloading: ', img_url)
            download_image(img_url, img_local_path)
        # collect the post as a plain dict
        rtn.append({
            "title": title,
            "slug": slug,
            "category": category,
            "post_date": post_date,
            "featured_image": featured_local_path,
            "post_content": post_content,
        })
    return rtn
def main():
    base_endpoint = domain_name+"wp-json/wp/v2/posts/"
    # per_page is set low to demo pagination; _embed pulls featured media and terms into the response
    start_url = base_endpoint+"?per_page=20&_embed&page="
    all_posts = []
    for p in range(total_pages):
        current_page = p+1
        current_url = start_url+str(current_page)
        print('-'*42)
        print(current_url)
        print('-'*42)
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
        resp = get(current_url, headers=headers)
        data = resp.json()
        posts_from_api_page = get_posts(domain_name, data)
        all_posts.extend(posts_from_api_page)
        # comment this out to go faster
        sleep(.5)
    with open('out.json', 'w') as outf:
        outf.write(dumps(all_posts))

if __name__ == "__main__":
    main()
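The total_pages value above is hard-coded from a manual look at the response headers. As a possible refinement (a minimal sketch, assuming the standard WordPress REST API pagination header X-WP-TotalPages; the get_total_pages helper is hypothetical and not part of the original script), the page count could be read from the first response instead:

# sketch: read the page count from the X-WP-TotalPages response header
# rather than hard-coding total_pages; get_total_pages is a hypothetical helper
def get_total_pages(endpoint, per_page=20):
    resp = get(endpoint, params={"per_page": per_page})
    # fall back to 1 if the header is missing
    return int(resp.headers.get("X-WP-TotalPages", 1))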