kbcarte/yoink.py
Utility script to custom-export WP blog posts via the REST API, including images, while retaining the image paths.
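Requires the requests and beautifulsoup4 packages (pip install requests beautifulsoup4).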
from requests import get
from bs4 import BeautifulSoup
from os import path, makedirs
from shutil import copyfileobj
from json import dumps
from time import sleep
# where we want to scrape
# the trailing / is needed
domain_name = "https://www.greatbigdigitalagency.com/"
domain_root = "greatbigdigitalagency.com/"
# found in the X-WP-TotalPages header of the response, I got it from Postman
total_pages = 5
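# (optional) rather than hard-coding, the count can be read from the first
# response header, e.g.:
#   total_pages = int(get(domain_name + "wp-json/wp/v2/posts/").headers["X-WP-TotalPages"])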
# attempt to add the trailing / if not found
if domain_name[-1] != '/':
    domain_name = domain_name + '/'
# https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
# if file or path does not exist, create it
def ensure_dir(file_path):
    directory = path.dirname(file_path)
    # guard against an empty dirname for files in the current directory
    if directory and not path.exists(directory):
        makedirs(directory)
# download a single image from the server, preserving its path locally
def download_image(url, local_path):
    # browser-like UA; some hosts block the default python-requests agent
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    # timeout hard-coded for 5 seconds
    # https://miro.medium.com/max/1318/1*8xraf6eyaXh-myNXOXkqLA.jpeg
    r = get(url, headers=headers, stream=True, timeout=5)
    # decode gzip/deflate so copyfileobj writes the actual image bytes
    r.raw.decode_content = True
    ensure_dir(local_path)
    with open(local_path, "wb") as f:
        copyfileobj(r.raw, f)
# grab post data returned by the ?page=X endpoint
# domain is the root domain used in search-replace
# will be run once for each page
def get_posts(domain, data):
    rtn = []
    for d in data:
        # dirty check if it has a featured image
        try:
            featured_img_url = d['_embedded']['wp:featuredmedia'][0]['source_url']
        except (KeyError, IndexError):
            featured_img_url = None
        # dirty check if it has a category
        try:
            category = d['_embedded']['wp:term'][0][0]['name']
        except (KeyError, IndexError):
            category = None
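        # both lookups above depend on the _embedded payload, which the API
        # only includes because the request URL passes the _embed query arg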
        # download featured image and keep path structure
        featured_local_path = None
        if featured_img_url:
            # remove domain and wp path to images
            # used as local path to keep file/folder structure
            featured_local_path = featured_img_url.replace(domain + 'wp-content/', '')
            print('Downloading: ', featured_img_url)
            download_image(featured_img_url, featured_local_path)
        # split out to vars for clarity
        slug = d['slug']
        post_date = d['date']
        title = d['title']['rendered']
        post_content = d['content']['rendered']
        # grab any images inside the content itself
        soup = BeautifulSoup(post_content, "html.parser")
        post_images = soup.find_all('img')
        for img in post_images:
            img_url = img['src']
            # don't care about images linked from other domains
            if 'wp-content/uploads' not in img_url:
                continue
            # run some search-replace on the urls
            # do this in stages since some urls might not have www
            img_local_path = img_url.replace('https://', '')
            img_local_path = img_local_path.replace('www.', '')
            img_local_path = img_local_path.replace(domain_root + 'wp-content/', '')
            print('Downloading: ', img_url)
            download_image(img_url, img_local_path)
        # collect the post as a dict for the json output
        rtn.append({
            "title": title,
            "slug": slug,
            "category": category,
            "post_date": post_date,
            "featured_image": featured_local_path,
            "post_content": post_content,
        })
    return rtn
def main():
    base_endpoint = domain_name + "wp-json/wp/v2/posts/"
    # set per_page for demoing pagination
    start_url = base_endpoint + "?per_page=20&_embed&page="
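    # note: the WP REST API caps per_page at 100; values above that return a 400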
    all_posts = []
    for p in range(total_pages):
        current_page = p + 1
        current_url = start_url + str(current_page)
        print('-' * 42)
        print(current_url)
        print('-' * 42)
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
        resp = get(current_url, headers=headers)
        data = resp.json()
        posts_from_api_page = get_posts(domain_name, data)
        all_posts.extend(posts_from_api_page)
        # comment this out to go faster
        sleep(.5)
    with open('out.json', 'w') as outf:
        outf.write(dumps(all_posts))

if __name__ == "__main__":
    main()
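For reference, a minimal sketch of loading the export back in. This reader is not part of the original script, but the file name and keys match what main() writes above:

from json import load

with open('out.json') as f:
    posts = load(f)

# each entry is one post dict produced by get_posts
for p in posts:
    print(p['post_date'], p['slug'], '->', p['featured_image'])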