@briglx
Last active January 29, 2019 21:54
Script to slurp blog posts from Blogger
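The script below walks a Blogger archive month by month, saves each post's text to its own folder, and downloads the embedded images. It assumes a stock Blogger template: the default archive widget (BlogArchive1_ArchiveList) and the date-header, post-title, and post-body CSS classes. Blogs with customized templates may need the selectors adjusted.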
#!/usr/bin/env python3
"""Slurp all images and text from the blog into a folder."""
import os
import re
from datetime import datetime
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup


def getPostsByYear(blogUrl, year, month):
    """Get the url for each post in a given month and year as a list.

    Defaults to returning an empty list if no posts are found.
    """
    url = blogUrl + str(year) + '/' + str(month).zfill(2)
    f = urlopen(url)
    soup = BeautifulSoup(f, 'html.parser')
    archiveList = soup.find(attrs={'id': 'BlogArchive1_ArchiveList'})
    uls = archiveList.findChildren('ul', recursive=False)
    # Match absolute post urls of the form <blog>/<year>/<month>/<slug>.
    pattern = re.compile(
        r"^(\w|\:|\/|\.)+" +
        str(year) + "/" +
        str(month).zfill(2) +
        r"/(\w|\-)+"
    )
    elinks = []
    for ul in uls:
        children = ul.findChildren(recursive=False)
        for child in children:
            a = child.find(
                'a',
                attrs={'class': 'post-count-link'},
                recursive=False
            )
            if a is None:
                continue
            linkYear = int(a.text.strip())
            if linkYear == year:
                monthLinks = ul.find_all('a')
                elinks = [ml.get('href') for ml in monthLinks
                          if pattern.match(ml.get('href'))]
    return elinks
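# For illustration (hypothetical url): with blogUrl 'http://example.blogspot.com/',
# year 2018 and month 5, the pattern above matches archive links such as
#     http://example.blogspot.com/2018/05/my-first-post.html
# while skipping the month and year links from the rest of the archive widget.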


def getPostInfo(url):
    """Get the post text and list of images and return them as a tuple.

    Returns None if the post cannot be fetched.
    """
    try:
        f = urlopen(url)
        soup = BeautifulSoup(f, 'html.parser')
        # Build a sortable date prefix from the post's date header.
        postDateStr = soup.find(attrs={'class': 'date-header'}).text.strip()
        postDateTime = datetime.strptime(postDateStr, '%A, %B %d, %Y')
        postDate = postDateTime.strftime("%Y%m%d")
        postTitle = soup.find(attrs={'class': 'post-title'}).text.strip()
        # Keep only characters that are safe in a file name.
        cleanPostTitle = ''.join(
            e for e in postTitle if e.isalnum() or e == ' ')
        postBody = soup.find(attrs={'class': 'post-body'})
        # Collapse the body into paragraphs separated by blank lines.
        postBodyClean = "\n\n".join(
            [s.strip() for s in postBody.text.strip().splitlines() if s])
        # Append the file name of each embedded image to the text.
        postImages = postBody.find_all('img')
        postBodyClean = postBodyClean + "\n\n"
        for img in postImages:
            imgNameParts = img.get('src').split('/')
            postBodyClean = postBodyClean + imgNameParts[-1] + "\n"
        return (postDate + "-" + cleanPostTitle + ".txt", postBodyClean)
    except ConnectionResetError:
        print("Connection closed ... try again: " + url)
        return None
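# Example of the returned tuple (hypothetical post): a post titled
# "My First Post!" published on Monday, May 14, 2018 comes back as
#     ('20180514-My First Post.txt', <body text plus image file names>)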


def savePostInfo(post):
    """Save post data to a file."""
    fileName = post[0]
    postText = post[1]
    # Each post gets its own folder, named after the file minus extension.
    subdirectory = fileName.split('.')[0]
    folderName = os.path.join("Posts", subdirectory)
    print("Making folder: " + folderName)
    os.makedirs(folderName, exist_ok=True)
    fullFileName = os.path.join(folderName, fileName)
    try:
        with open(fullFileName, 'w', encoding='utf8') as file:
            file.write(postText)
    except OSError as err:
        print("Failed to save file " + fullFileName)
        print(err)
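# Resulting layout on disk (illustrative names):
#     Posts/20180514-My First Post/20180514-My First Post.txt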


def savePostImages(url):
    """Download post images and save them to the post folder."""
    try:
        f = urlopen(url)
        soup = BeautifulSoup(f, 'html.parser')
        # Rebuild the same folder name that savePostInfo uses.
        postDateStr = soup.find(attrs={'class': 'date-header'}).text.strip()
        postDateTime = datetime.strptime(postDateStr, '%A, %B %d, %Y')
        postDate = postDateTime.strftime("%Y%m%d")
        postTitle = soup.find(attrs={'class': 'post-title'}).text.strip()
        cleanPostTitle = ''.join(
            e for e in postTitle if e.isalnum() or e == ' ')
        subDirectory = postDate + "-" + cleanPostTitle
        postBody = soup.find(attrs={'class': 'post-body'})
        postImages = postBody.find_all('img')
        os.makedirs(os.path.join("Posts", subDirectory), exist_ok=True)
        for img in postImages:
            # Swap the size segment of the url (e.g. s320) for s2400 so
            # Blogger serves the largest available copy of the image.
            imgParts = img.get('src').split('/')
            imgParts[-2] = 's2400'
            imgUrl = '/'.join(imgParts)
            try:
                urlretrieve(imgUrl, os.path.join(
                    "Posts", subDirectory, os.path.basename(imgUrl)))
            except Exception:
                print("Failed to download for " + subDirectory + " " + imgUrl)
    except ConnectionResetError:
        print("Connection closed ... try again: " + url)
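# Example of the size swap above (hypothetical url): an img src of
#     https://1.bp.blogspot.com/XXXX/YYYY/s320/photo.jpg
# is rewritten to
#     https://1.bp.blogspot.com/XXXX/YYYY/s2400/photo.jpg
# before the download, so the full-resolution copy is saved.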


def slurpBlog(blogUrl, year, month):
    """Fetch text and images for a given blog year and month.

    Creates a Posts folder with a subfolder for each post.
    """
    postLinks = getPostsByYear(blogUrl, year, month)
    folderName = "Posts"
    print("Making folder: " + folderName)
    os.makedirs(folderName, exist_ok=True)
    for postLink in postLinks:
        post = getPostInfo(postLink)
        if post is not None:
            savePostInfo(post)
            savePostImages(postLink)


def main():
    """Run the script."""
    blogUrl = 'http://example.blogspot.com/'
    year = 2018
    # Slurp every month of the year.
    for month in range(1, 13):
        slurpBlog(blogUrl, year, month)


if __name__ == '__main__':
    main()
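To run the script, install BeautifulSoup (pip install beautifulsoup4), point blogUrl in main() at a real Blogger address, and run the file with python3. It fetches every post from 2018 into a Posts folder next to the script, one subfolder per post.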