Script to slurp blog posts from blogger
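The script walks Blogger's default monthly archive pages and relies on the stock BlogArchive1 widget markup. A minimal sketch of the archive URL it builds for one month (the blog URL is the same placeholder used in main() below):

blogUrl = 'http://example.blogspot.com/'
year, month = 2018, 1
archiveUrl = blogUrl + str(year) + '/' + str(month).zfill(2)
# archiveUrl is 'http://example.blogspot.com/2018/01'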
#!/usr/bin/env python3
"""Slurp all Images and text from the blog into a folder."""
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
from datetime import datetime
import os
import os.path
import re


def getPostsByYear(blogUrl, year, month):
    """Get the url of each post for the given month and year as a list.

    Defaults to returning an empty list if no posts are found.
    """
    url = blogUrl + str(year) + '/' + str(month).zfill(2)
    f = urlopen(url)
    soup = BeautifulSoup(f, 'html.parser')
    # The stock Blogger archive widget holds the links to every post.
    archiveList = soup.find(attrs={'id': 'BlogArchive1_ArchiveList'})
    uls = archiveList.findChildren('ul', recursive=False)
    # Match post urls of the form .../<year>/<month>/<slug>.
    pattern = re.compile(
        r"^(\w|\:|\/|\.)+" +
        str(year) + "/" +
        str(month).zfill(2) +
        r"/(\w|\-)+"
    )
    elinks = []
    for ul in uls:
        children = ul.findChildren(recursive=False)
        for child in children:
            a = child.find(
                'a',
                attrs={'class': 'post-count-link'},
                recursive=False
            )
            linkYear = int(a.text.strip())
            if linkYear == year:
                monthLinks = ul.find_all('a')
                elinks = [ml.get('href') for ml in monthLinks
                          if pattern.match(ml.get('href'))]
    return elinks


def getPostInfo(url):
    """Get the post text and list of images and return them as a tuple.

    Defaults to returning an empty tuple if the post is not found.
    """
    try:
        f = urlopen(url)
        soup = BeautifulSoup(f, 'html.parser')
        postDateStr = soup.find(attrs={'class': 'date-header'}).text.strip()
        postDateTime = datetime.strptime(postDateStr, '%A, %B %d, %Y')
        postDate = postDateTime.strftime("%Y%m%d")
        postTitle = soup.find(attrs={'class': 'post-title'}).text.strip()
        cleanPostTitle = ''.join(
            e for e in postTitle if e.isalnum() or e == ' ')
        postBody = soup.find(attrs={'class': 'post-body'})
        postBodyClean = "\n\n".join(
            [s.strip() for s in postBody.text.strip().splitlines() if s])
        postImages = postBody.find_all('img')
        # Append the file name of each image so the text file lists them.
        postBodyClean = postBodyClean + "\n\n"
        for img in postImages:
            imgNameParts = img.get('src').split('/')
            postBodyClean = postBodyClean + imgNameParts[-1] + "\n"
        return (postDate + "-" + cleanPostTitle + ".txt", postBodyClean)
    except ConnectionResetError:
        print("Connection closed... try again: " + url)
        return ()


def savePostInfo(post):
    """Save post data to a file."""
    subdirectory = post[0].split('.')[0]
    fileName = post[0]
    postText = post[1]
    folderName = os.path.join("Posts", subdirectory)
    try:
        print("Making folder: " + folderName)
        os.mkdir(folderName)
    except Exception:
        print("Failed to make folder " + folderName)
    try:
        fullFileName = os.path.join(folderName, fileName)
        with open(fullFileName, 'w', encoding='utf8') as file:
            file.write(postText)
    except Exception as e:
        print("Failed to save file " + fullFileName)
        print(e)


def savePostImages(url):
    """Download post images and save them to the post folder."""
    try:
        f = urlopen(url)
        soup = BeautifulSoup(f, 'html.parser')
        postDateStr = soup.find(attrs={'class': 'date-header'}).text.strip()
        postDateTime = datetime.strptime(postDateStr, '%A, %B %d, %Y')
        postDate = postDateTime.strftime("%Y%m%d")
        postTitle = soup.find(attrs={'class': 'post-title'}).text.strip()
        cleanPostTitle = ''.join(
            e for e in postTitle if e.isalnum() or e == ' ')
        subDirectory = postDate + "-" + cleanPostTitle
        postBody = soup.find(attrs={'class': 'post-body'})
        postImages = postBody.find_all('img')
        try:
            os.mkdir(os.path.join("Posts", subDirectory))
        except Exception:
            pass
        for img in postImages:
            imgParts = img.get('src').split('/')
            # Swap the size segment of the Blogger image url for a larger one.
            imgParts[-2] = 's2400'
            imgUrl = '/'.join(imgParts)
            try:
                urlretrieve(imgUrl, os.path.join(
                    "Posts", subDirectory, os.path.basename(imgUrl)))
            except Exception:
                print("Failed to download for " + subDirectory + " " + imgUrl)
    except ConnectionResetError:
        print("Connection closed... try again: " + url)


def slurpBlog(blogUrl, year, month):
    """Fetch text and images for a given blog year and month.

    Creates a Posts folder with a subfolder for each post.
    """
    postLinks = getPostsByYear(blogUrl, year, month)
    folderName = os.path.join("Posts")
    try:
        print("Making folder: " + folderName)
        os.mkdir(folderName)
    except Exception:
        print("Failed to make folder " + folderName)
    for postLink in postLinks:
        post = getPostInfo(postLink)
        if post:
            savePostInfo(post)
        savePostImages(postLink)


def main():
    """Run the script."""
    blogUrl = 'http://example.blogspot.com/'
    year = 2018
    for i in range(12):
        slurpBlog(blogUrl, year, i + 1)


if __name__ == '__main__':
    main()
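A minimal usage sketch for a single month, assuming the file is saved as slurp_blog.py (a hypothetical module name) and beautifulsoup4 is installed; the blog URL is a placeholder:

from slurp_blog import slurpBlog  # hypothetical module name for this script

slurpBlog('http://example.blogspot.com/', 2018, 1)
# Creates Posts/<YYYYMMDD>-<title>/ holding a .txt of the post text
# plus each image from the post.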