Utility script to do a custom export of WordPress blog posts via the REST API, including images, while retaining the original image paths.
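The script needs only requests and beautifulsoup4. It pages through the site's wp-json/wp/v2/posts endpoint, downloads the featured image and any in-content images into a local folder tree that mirrors wp-content/uploads, and writes the collected posts to out.json. Each entry in out.json has the shape sketched below (a minimal illustration; the values are placeholders, not real export data):

{
    "title": "Example Post Title",
    "slug": "example-post-title",
    "category": "News",
    "post_date": "2020-06-01T12:00:00",
    "featured_image": "uploads/2020/06/example.jpg",
    "post_content": "<p>Rendered post HTML...</p>"
}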
from requests import get
from bs4 import BeautifulSoup
from os import path, makedirs
from shutil import copyfileobj
from json import dumps
from time import sleep
# the site we want to scrape
# the trailing / is needed
domain_name = "https://www.greatbigdigitalagency.com/"
domain_root = "greatbigdigitalagency.com/"
# total number of result pages; found in the response headers (value taken from Postman)
total_pages = 5

# add the trailing / if it is missing
if domain_name[-1] != '/':
    domain_name = domain_name+'/'
# https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
# create the directory for a file path if it does not exist yet
def ensure_dir(file_path):
    directory = path.dirname(file_path)
    if directory and not path.exists(directory):
        makedirs(directory)
# download a single image from the server, preserving the remote path locally
def download_image(url, local_path):
    # timeout hard-coded to 5 seconds
    # https://miro.medium.com/max/1318/1*8xraf6eyaXh-myNXOXkqLA.jpeg
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    r = get(url, headers=headers, stream=True, timeout=5)
    ensure_dir(local_path)
    with open(local_path, "wb") as f:
        copyfileobj(r.raw, f)
# grab the post data returned by the ?page=X endpoint
# domain is the root domain used in search-replace
# this will be run once for each page of results
def get_posts(domain, data):
    rtn = []
    for d in data:
        # dirty check for a featured image
        try:
            featured_img_url = d['_embedded']['wp:featuredmedia'][0]['source_url']
        except (KeyError, IndexError, TypeError):
            featured_img_url = None
        # dirty check for a category
        try:
            category = d['_embedded']['wp:term'][0][0]['name']
        except (KeyError, IndexError, TypeError):
            category = None
        # download the featured image and keep the path structure
        featured_local_path = None
        if featured_img_url:
            # strip the domain and wp-content prefix from the URL;
            # the remainder is the local path, keeping the file/folder structure
            featured_local_path = featured_img_url.replace(domain+'wp-content/', '')
            print('Downloading: ', featured_img_url)
            download_image(featured_img_url, featured_local_path)
        # split out to variables for clarity
        slug = d['slug']
        post_date = d['date']
        title = d['title']['rendered']
        post_content = d['content']['rendered']
        # grab any images inside the content itself
        soup = BeautifulSoup(post_content, "html.parser")
        post_images = soup.find_all('img')
        for img in post_images:
            img_url = img['src']
            # skip images linked from other domains
            if 'wp-content/uploads' not in img_url:
                continue
            # run some search-replace on the URLs
            # do this in stages since some URLs might not have www
            img_local_path = img_url.replace('https://', '')
            img_local_path = img_local_path.replace('www.', '')
            img_local_path = img_local_path.replace(domain_root+'wp-content/', '')
            print('Downloading: ', img_url)
            download_image(img_url, img_local_path)
        # collect the post as a plain dict
        rtn.append({
            "title": title,
            "slug": slug,
            "category": category,
            "post_date": post_date,
            "featured_image": featured_local_path,
            "post_content": post_content,
        })
    return rtn
def main():
    base_endpoint = domain_name+"wp-json/wp/v2/posts/"
    # per_page is set low to demo pagination; _embed pulls featured media and terms into the response
    start_url = base_endpoint+"?per_page=20&_embed&page="
    all_posts = []
    for p in range(total_pages):
        current_page = p+1
        current_url = start_url+str(current_page)
        print('-'*42)
        print(current_url)
        print('-'*42)
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
        resp = get(current_url, headers=headers)
        data = resp.json()
        posts_from_api_page = get_posts(domain_name, data)
        all_posts.extend(posts_from_api_page)
        # comment this out to go faster
        sleep(.5)
    with open('out.json', 'w') as outf:
        outf.write(dumps(all_posts))

if __name__ == "__main__":
    main()
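The total_pages value above is hard-coded from a manual look at the response headers. As a possible refinement (a minimal sketch, assuming the standard WordPress REST API pagination header X-WP-TotalPages; the get_total_pages helper is hypothetical and not part of the original script), the page count could be read from the first response instead:

# sketch: read the page count from the X-WP-TotalPages response header
# rather than hard-coding total_pages; get_total_pages is a hypothetical helper
def get_total_pages(endpoint, per_page=20):
    resp = get(endpoint, params={"per_page": per_page})
    # fall back to 1 if the header is missing
    return int(resp.headers.get("X-WP-TotalPages", 1))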