@briglx
Last active January 29, 2019 21:54
Script to slurp blog posts from Blogger
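The script below walks a Blogger archive month by month, saves each post's text to its own folder, and downloads the embedded images. It assumes a stock Blogger template: the default archive widget (BlogArchive1_ArchiveList) and the date-header, post-title, and post-body CSS classes. Blogs with customized templates may need the selectors adjusted.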
#!/usr/bin/env python3
"""Slurp all images and text from the blog into a folder."""
import os
import re
from datetime import datetime
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup


def getPostsByYear(blogUrl, year, month):
    """Get the url for each post in a given month and year as a list.

    Defaults to returning an empty list if no posts are found.
    """
    url = blogUrl + str(year) + '/' + str(month).zfill(2)
    f = urlopen(url)
    soup = BeautifulSoup(f, 'html.parser')
    archiveList = soup.find(attrs={'id': 'BlogArchive1_ArchiveList'})
    uls = archiveList.findChildren('ul', recursive=False)
    # Match absolute post urls of the form <blog>/<year>/<month>/<slug>.
    pattern = re.compile(
        r"^(\w|\:|\/|\.)+" +
        str(year) + "/" +
        str(month).zfill(2) +
        r"/(\w|\-)+"
    )
    elinks = []
    for ul in uls:
        children = ul.findChildren(recursive=False)
        for child in children:
            a = child.find(
                'a',
                attrs={'class': 'post-count-link'},
                recursive=False
            )
            if a is None:
                continue
            linkYear = int(a.text.strip())
            if linkYear == year:
                monthLinks = ul.find_all('a')
                elinks = [ml.get('href') for ml in monthLinks
                          if pattern.match(ml.get('href'))]
    return elinks
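# For illustration (hypothetical url): with blogUrl 'http://example.blogspot.com/',
# year 2018 and month 5, the pattern above matches archive links such as
#     http://example.blogspot.com/2018/05/my-first-post.html
# while skipping the month and year links from the rest of the archive widget.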


def getPostInfo(url):
    """Get the post text and list of images and return them as a tuple.

    Returns None if the post cannot be fetched.
    """
    try:
        f = urlopen(url)
        soup = BeautifulSoup(f, 'html.parser')
        # Build a sortable date prefix from the post's date header.
        postDateStr = soup.find(attrs={'class': 'date-header'}).text.strip()
        postDateTime = datetime.strptime(postDateStr, '%A, %B %d, %Y')
        postDate = postDateTime.strftime("%Y%m%d")
        postTitle = soup.find(attrs={'class': 'post-title'}).text.strip()
        # Keep only characters that are safe in a file name.
        cleanPostTitle = ''.join(
            e for e in postTitle if e.isalnum() or e == ' ')
        postBody = soup.find(attrs={'class': 'post-body'})
        # Collapse the body into paragraphs separated by blank lines.
        postBodyClean = "\n\n".join(
            [s.strip() for s in postBody.text.strip().splitlines() if s])
        # Append the file name of each embedded image to the text.
        postImages = postBody.find_all('img')
        postBodyClean = postBodyClean + "\n\n"
        for img in postImages:
            imgNameParts = img.get('src').split('/')
            postBodyClean = postBodyClean + imgNameParts[-1] + "\n"
        return (postDate + "-" + cleanPostTitle + ".txt", postBodyClean)
    except ConnectionResetError:
        print("Connection closed ... try again: " + url)
        return None
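# Example of the returned tuple (hypothetical post): a post titled
# "My First Post!" published on Monday, May 14, 2018 comes back as
#     ('20180514-My First Post.txt', <body text plus image file names>)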


def savePostInfo(post):
    """Save post data to a file."""
    fileName = post[0]
    postText = post[1]
    # Each post gets its own folder, named after the file minus extension.
    subdirectory = fileName.split('.')[0]
    folderName = os.path.join("Posts", subdirectory)
    print("Making folder: " + folderName)
    os.makedirs(folderName, exist_ok=True)
    fullFileName = os.path.join(folderName, fileName)
    try:
        with open(fullFileName, 'w', encoding='utf8') as file:
            file.write(postText)
    except OSError as err:
        print("Failed to save file " + fullFileName)
        print(err)
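# Resulting layout on disk (illustrative names):
#     Posts/20180514-My First Post/20180514-My First Post.txt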


def savePostImages(url):
    """Download post images and save them to the post folder."""
    try:
        f = urlopen(url)
        soup = BeautifulSoup(f, 'html.parser')
        # Rebuild the same folder name that savePostInfo uses.
        postDateStr = soup.find(attrs={'class': 'date-header'}).text.strip()
        postDateTime = datetime.strptime(postDateStr, '%A, %B %d, %Y')
        postDate = postDateTime.strftime("%Y%m%d")
        postTitle = soup.find(attrs={'class': 'post-title'}).text.strip()
        cleanPostTitle = ''.join(
            e for e in postTitle if e.isalnum() or e == ' ')
        subDirectory = postDate + "-" + cleanPostTitle
        postBody = soup.find(attrs={'class': 'post-body'})
        postImages = postBody.find_all('img')
        os.makedirs(os.path.join("Posts", subDirectory), exist_ok=True)
        for img in postImages:
            # Swap the size segment of the url (e.g. s320) for s2400 so
            # Blogger serves the largest available copy of the image.
            imgParts = img.get('src').split('/')
            imgParts[-2] = 's2400'
            imgUrl = '/'.join(imgParts)
            try:
                urlretrieve(imgUrl, os.path.join(
                    "Posts", subDirectory, os.path.basename(imgUrl)))
            except Exception:
                print("Failed to download for " + subDirectory + " " + imgUrl)
    except ConnectionResetError:
        print("Connection closed ... try again: " + url)
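# Example of the size swap above (hypothetical url): an img src of
#     https://1.bp.blogspot.com/XXXX/YYYY/s320/photo.jpg
# is rewritten to
#     https://1.bp.blogspot.com/XXXX/YYYY/s2400/photo.jpg
# before the download, so the full-resolution copy is saved.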


def slurpBlog(blogUrl, year, month):
    """Fetch text and images for a given blog year and month.

    Creates a Posts folder with a subfolder for each post.
    """
    postLinks = getPostsByYear(blogUrl, year, month)
    folderName = "Posts"
    print("Making folder: " + folderName)
    os.makedirs(folderName, exist_ok=True)
    for postLink in postLinks:
        post = getPostInfo(postLink)
        if post is not None:
            savePostInfo(post)
            savePostImages(postLink)


def main():
    """Run the script."""
    blogUrl = 'http://example.blogspot.com/'
    year = 2018
    # Slurp every month of the year.
    for month in range(1, 13):
        slurpBlog(blogUrl, year, month)


if __name__ == '__main__':
    main()
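To run the script, install BeautifulSoup (pip install beautifulsoup4), point blogUrl in main() at a real Blogger address, and run the file with python3. It fetches every post from 2018 into a Posts folder next to the script, one subfolder per post.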