
@santiagobasulto
Last active November 20, 2021 19:09
A quick script to transform a Markdown file's relative images to absolute by uploading them to a plugin-based service (S3, Imgur)
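For context, here is a minimal sketch of the extraction step the script relies on; the regex is the same one the script uses to pull relative image paths out of inline image references, and the sample Markdown line is made up.

import re

# Matches Markdown inline images of the form ![alt text](path/to/image.png)
PATTERN_FNAME = r'(?:!\[(?:.*?)\]\((?P<filename>.*?)\))'

sample = "Some intro text ![architecture diagram](images/architecture.png) and more."
print(re.findall(PATTERN_FNAME, sample))
# -> ['images/architecture.png']; each match is later replaced with the uploaded URL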
import re
import mimetypes
from pathlib import Path

import requests
import boto3
from botocore.exceptions import ClientError

# Regexes matching Markdown inline images: ![alt text](path/to/image.png)
PATTERN_FULL = r'(?:!\[(?P<alt_text>.*?)\]\((?P<filename>.*?)\))'
PATTERN_FNAME = r'(?:!\[(?:.*?)\]\((?P<filename>.*?)\))'

class Uploader:
    def upload_image(self, image_path, override=False):
        raise NotImplementedError()

class S3Uploader(Uploader):
    def __init__(self, s3_bucket, s3_relative_path, s3_ACL=None,
                 cloudfront_domain=None, cache_control=None):
        self.bucket = s3_bucket
        self.relative_path = s3_relative_path.strip('/')
        self.s3_acl = s3_ACL
        self.cloudfront_domain = cloudfront_domain
        self.cache_control = cache_control
        if self.cloudfront_domain:
            # Expect a bare domain (e.g. "cdn.example.com"), not a full URL.
            assert not self.cloudfront_domain.startswith('http://')
            assert not self.cloudfront_domain.startswith('https://')
        self.client = boto3.client('s3')

    def upload_image(self, image_path, override=False):
        dns = self.cloudfront_domain or f'{self.bucket}.s3.amazonaws.com'
        p = Path(image_path)
        kwargs = {}
        if self.cache_control:
            kwargs['CacheControl'] = self.cache_control
        if self.s3_acl:
            kwargs['ACL'] = self.s3_acl
        key = f'{self.relative_path}/{p.name}'
        url = f"https://{dns}/{key}"
        if not override:
            # If the object already exists, reuse its URL instead of re-uploading.
            try:
                self.client.head_object(Bucket=self.bucket, Key=key)
                print(f"\tFound: {url}")
                return url
            except ClientError as exc:
                if exc.response['Error']['Code'] != "404":
                    raise
        content_type, _ = mimetypes.guess_type(str(image_path))
        if content_type:
            kwargs['ContentType'] = content_type
        with p.open('rb') as fp:
            self.client.put_object(
                Body=fp.read(),
                Bucket=self.bucket,
                Key=key,
                **kwargs
            )
        print(f"\tUploaded: {url}")
        return url

class ImgurUploader(Uploader):
    def __init__(self, imgur_access_token):
        self.access_token = imgur_access_token

    def upload_image(self, image_path, override=False):
        p = Path(image_path)
        headers = {"Authorization": f"Bearer {self.access_token}"}
        with p.open('rb') as fp:
            files = {'image': fp}
            resp = requests.post(
                'https://api.imgur.com/3/image',
                headers=headers, files=files)
        resp.raise_for_status()
        return resp.json()['data']['link']

UPLOADERS = {
    's3': S3Uploader,
    'imgur': ImgurUploader,
}

def main(original_path, output_path, uploader, override=False, **uploader_kwargs):
    """Reads a markdown file, finds all the images and uploads them using `uploader`.

    The result is a new file under `output_path`. Provide specific parameters
    for the uploader with `uploader_kwargs`.

    Parameters
    ----------
    original_path: str, a valid filesystem path
        The path of the markdown file to transform.
    output_path: str, a valid filesystem path
        The path where the resulting markdown file will be stored.
        WARNING! This file will be overwritten.
    uploader: str, a choice of uploaders
        The uploader to use. Must be one of the keys in the `UPLOADERS` dict.
    override: bool
        Passed to the uploader; whether an already-uploaded image should be overridden.
        It is the uploader's responsibility to respect this flag.
    **uploader_kwargs: keyword arguments
        Everything else is passed to the Uploader when it is initialized.
    """
    UploaderClass = UPLOADERS[uploader]
    uploader = UploaderClass(**uploader_kwargs)
    original_path = Path(original_path)
    base_path = original_path.parent
    pattern = re.compile(PATTERN_FNAME)
    with original_path.open() as fp:
        content = fp.read()
    # Map each relative image reference to its absolute path on disk.
    image_relative_paths = set(pattern.findall(content))
    image_mapping = {
        image_relative: (base_path / image_relative)
        for image_relative in image_relative_paths
    }
    missing_images = [
        str(abs_path) for abs_path in image_mapping.values() if not abs_path.exists()
    ]
    if missing_images:
        raise ValueError(f'Missing images: {",".join(missing_images)}')
    # Upload every image and record the resulting public URL.
    image_results = {
        relative_path: uploader.upload_image(abs_path, override)
        for relative_path, abs_path in image_mapping.items()
    }
    # Rewrite the relative references in the Markdown with the uploaded URLs.
    for relative_path, upload_path in image_results.items():
        content = content.replace(relative_path, upload_path)
    with open(output_path, 'w') as fp:
        fp.write(content)
    return image_results

CMD_REQUIRED_ARGUMENTS = {
    's3': ['s3_bucket', 's3_relative_path'],
    'imgur': ['imgur_access_token'],
}

if __name__ == "__main__":
    import argparse
    import pathlib

    parser = argparse.ArgumentParser(
        description="Upload a Markdown file's relative images and rewrite them as absolute URLs.")
    parser.add_argument('input', type=pathlib.Path, help='A path to the markdown with relative images to transform')
    parser.add_argument('output', type=pathlib.Path, help='A path to store the output of the process')
    parser.add_argument('-u', '--uploader', choices=['s3', 'imgur'], required=True)
    parser.add_argument('-o', '--override', action='store_true')

    # S3 specific params
    parser.add_argument('--s3-bucket')
    parser.add_argument('--s3-relative-path', help="Where to store the images within the bucket. A key prefix.")
    parser.add_argument('--s3-acl', default='private')
    parser.add_argument('--s3-cf-domain', help="The CloudFront domain to use instead of S3's default URL")
    parser.add_argument('--s3-cache-control')

    # Imgur specific params
    parser.add_argument('--imgur-access-token')

    args = parser.parse_args()
    assert all(
        bool(getattr(args, arg)) for arg in CMD_REQUIRED_ARGUMENTS[args.uploader]
    ), "Missing arguments"

    if args.uploader == 's3':
        results = main(
            args.input,
            args.output,
            's3',
            override=args.override,
            s3_bucket=args.s3_bucket,
            s3_relative_path=args.s3_relative_path,
            s3_ACL=args.s3_acl,
            cloudfront_domain=args.s3_cf_domain,
            cache_control=args.s3_cache_control,
        )
    else:
        results = main(
            args.input,
            args.output,
            'imgur',
            override=args.override,
            imgur_access_token=args.imgur_access_token,
        )
    print('\n')
    print('-' * 60)
    print(f"Replaced {len(results)} images")
@santiagobasulto (Author)

I created this script in just a few minutes to solve an issue with a bunch of MD files that were exported with relative, local images, which I needed to be absolute and hosted on S3. Please make sure you read how it works before using it.
