Skip to content

Instantly share code, notes, and snippets.

@paulschow
Created December 25, 2022 05:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paulschow/273f14698e09e7b0435334cba093a160 to your computer and use it in GitHub Desktop.
Save paulschow/273f14698e09e7b0435334cba093a160 to your computer and use it in GitHub Desktop.
Script to download images from blogger and replace them in a markdown file
# Script to download images from blogger and replace them in a markdown file
# Works with markdown files generated by blog2md
# https://github.com/palaniraja/blog2md
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import urllib
from selenium.webdriver.firefox.options import Options
import os
# Use selenium with a profile that is logged into google
ffOptions = Options()
ffOptions.add_argument("-profile")
ffOptions.add_argument(r'/home/paul/snap/firefox/common/.mozilla/firefox/cujaaab5.seltest')
driver = webdriver.Firefox(options=ffOptions)
# Open index.md file as lines
inputfile = open('index.md', 'r')
Lines = inputfile.readlines()
inputfile.close()
# Rename the index.md file
os.rename('index.md', 'index.mdold')
count = 0
for line in Lines:
count += 1
# images generated by blog2md have lines that begin with [!
if line[0:2] == "[!":
#print (line)
stext = line.split("(")
#print (stext)
#print (stext[1])
# Get the image url
imgurl = stext[1].split(")")[0]
#print (imgurl)
# Get the filename from the url
filename = imgurl.split("/")[-1]
#print (filename)
# Get the extension of the file
# This works 90% of the time
extension = filename.split(".")[1]
#print (extension)
# Create a new file name to avoid errors
newfilename = "image" + str(count-1) + "." + extension
mdname = "![" + newfilename + "](" + newfilename + ")"
#mdname = "![" + filename + "](" + filename + ")"
#print (mdname)
Lines[count-1] = mdname
# Use archive.org to get images for one post that was very broken
#archiveurl = "https://web.archive.org/web/20190225130331im_/" + imgurl
#driver.get(archiveurl)
# Get the image and save it
drive.get(imgurl)
img = driver.find_element(By.XPATH, '/html/body/img')
src = img.get_attribute('src')
urllib.request.urlretrieve(src, filename)
os.rename( filename, newfilename)
#print (Lines[10])
driver.close()
# Write the lines back into a markdown file
newfile = open('index.md', 'w')
newfile.writelines(Lines)
newfile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment