Skip to content

Instantly share code, notes, and snippets.

@mufidu
Last active April 28, 2024 06:18
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mufidu/f7b795f844f1ee4dc78e55123d5a398b to your computer and use it in GitHub Desktop.
Save mufidu/f7b795f844f1ee4dc78e55123d5a398b to your computer and use it in GitHub Desktop.
Download all images in markdown files and rename the links to point to the local files
# Download all images referenced by the markdown files in the FOLDER_NAME folder
# and save them to the FOLDER_NAME/_attachments folder.
# Then replace remote image paths with local paths.
# Usage: python download_imgs.py
import os
import urllib
import urllib.parse
import urllib.request
import re
import time
import socket
# Change this to the folder name containing your markdown files
FOLDER_NAME = "articles"

# Seconds to wait on a server before giving up on an image.
_DOWNLOAD_TIMEOUT = 30

# Markdown image syntax: ![alt](url) or ![alt](url "title").
# The alt text is matched lazily and the URL stops at whitespace or ")" so
# that several images on one line, linked images such as
# [![alt](img)](link), and optional titles each yield just the image URL.
_IMAGE_PATTERN = re.compile(r'!\[.*?\]\(\s*([^)\s]+)[^)\n]*\)')

# Browser-like headers: some hosts answer 403 to urllib's default User-Agent.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}


def _has_scheme(url):
    """Return True if *url* carries a URL scheme (i.e. is not a local path)."""
    try:
        return bool(urllib.parse.urlparse(url).scheme)
    except ValueError:
        # Malformed text inside the parentheses; nothing we can download.
        return False


def _find_image_urls(content):
    """Return the downloadable image URLs found in *content*.

    URLs are returned in order of first appearance with duplicates removed,
    so each image is fetched only once. Paths without a scheme (already
    local, e.g. from a previous run) are left out.
    """
    found = _IMAGE_PATTERN.findall(content)
    return [url for url in dict.fromkeys(found) if _has_scheme(url)]


def _download_image(url, attachments_dir):
    """Download *url* into *attachments_dir* and return the saved file name.

    The file name is the URL's base name prefixed with the current timestamp
    to avoid clashes between images that share a name.
    """
    name = str(time.time()) + os.path.basename(urllib.parse.urlparse(url).path)
    request = urllib.request.Request(url, headers=_HEADERS)
    # The timeout is passed to the request itself so it also covers the very
    # first download.
    with urllib.request.urlopen(request, timeout=_DOWNLOAD_TIMEOUT) as response:
        data = response.read()
    # Only create the file once the body has been fully read, so a failed
    # download does not leave an empty attachment behind.
    with open(os.path.join(attachments_dir, name), 'wb') as out:
        out.write(data)
    return name


def main(folder=FOLDER_NAME, delay=1.0):
    """Download the images of every markdown file in *folder*.

    Each image is saved to ``<folder>/_attachments`` and its URL inside the
    markdown file is replaced by the relative path ``_attachments/<name>``.
    Images that fail to download are reported and left untouched.

    Args:
        folder: Directory containing the ``.md`` files to process.
        delay: Seconds to sleep after each download attempt, to avoid
            rate limiting.
    """
    attachments_dir = os.path.join(folder, '_attachments')

    for file_name in os.listdir(folder):
        if not file_name.endswith('.md'):
            continue

        path = os.path.join(folder, file_name)
        with open(path, 'r', encoding='utf-8') as markdown:
            content = markdown.read()

        urls = _find_image_urls(content)
        if not urls:
            continue
        os.makedirs(attachments_dir, exist_ok=True)

        local_paths = {}
        for img in urls:
            try:
                name = _download_image(img, attachments_dir)
            except Exception as e:
                # Best effort: one broken image must not abort the whole run.
                print('Error downloading ' + img + ': ' + str(e))
            else:
                local_paths[img] = '_attachments/' + name
                print('Downloaded ' + img + f' to {folder}/_attachments/' + name)
            # Sleep to avoid rate limiting, if any
            time.sleep(delay)

        # Replace the longest URLs first so a URL that is a prefix of another
        # one cannot corrupt the longer link.
        updated = content
        for img in sorted(local_paths, key=len, reverse=True):
            updated = updated.replace(img, local_paths[img])

        if updated != content:
            with open(path, 'w', encoding='utf-8') as markdown:
                markdown.write(updated)


if __name__ == "__main__":
    main()
@mufidu
Copy link
Author

mufidu commented Dec 15, 2022

Should've processed the URLs first to deal with weird links. E.g. Insider's images' links look like this: {"https://i.insider.com/5699688ce6183e46258b7648":{"contentType":"image/png","aspectRatioW":1575,"aspectRatioH":1181}}.

But it works for normal links.

@boxydog
Copy link

boxydog commented Dec 4, 2023

This was helpful for me, thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment