Skip to content

Instantly share code, notes, and snippets.

@3outeille
Last active January 26, 2022 19:48
Show Gist options
  • Save 3outeille/1cd8997246fe2f906b9dddf2cf6820a6 to your computer and use it in GitHub Desktop.
Save 3outeille/1cd8997246fe2f906b9dddf2cf6820a6 to your computer and use it in GitHub Desktop.
Standalone script which parses HackMD markdown (including imgur links) and creates directories (named from the tags) containing the downloaded imgur images
  • Developed under Python 3.7.12
  • Standalone script which parses HackMD markdown (including imgur links) and creates directories (named from the tags) containing the downloaded imgur images
  • To use, python3 script.py --zip_file <file.zip>
  • To export your hackmd markdown as zip file
import glob
import re
import itertools
import os
import argparse
import zipfile
import shutil
import subprocess
import filecmp
import logging
import sys
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of ``it`` while drawing a text progress bar on ``file``.

    Adapted from https://stackoverflow.com/a/34482761

    Args:
        it: A sized iterable (``len(it)`` must work).
        prefix: Text written before the bar on every refresh.
        size: Width of the bar in characters.
        file: Writable text stream that receives the bar.

    Yields:
        Each item of ``it``, in order. The bar is redrawn after every item
        and a final newline is written once the iterable is exhausted.
    """
    count = len(it)

    def show(done):
        # An empty input is drawn as a complete bar; this also avoids the
        # division by zero that ``size * done / count`` would otherwise hit.
        filled = int(size * done / count) if count else size
        file.write(
            "%s[%s%s] %i/%i\r"
            % (prefix, "#" * filled, "." * (size - filled), done, count)
        )
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    file.write("\n")
    file.flush()
# Configure the root logger to write timestamped records at INFO level and
# above to log.txt; filemode 'w' truncates the file each time this runs.
logging.basicConfig(
    level=logging.INFO,
    filename='log.txt',
    filemode='w',
    format='%(asctime)s - %(message)s',
)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
def getTags(lines):
    """Extract the tag names declared on the second line of a note.

    The second line is searched for ``tags:`` followed by a comma-separated
    list, e.g. ``tags: math, algebra``.

    Args:
        lines: The note's lines, as returned by ``readlines()``.

    Returns:
        A list of tag strings with all whitespace removed and empty entries
        dropped. Empty when there are fewer than two lines, when the second
        line has no ``tags:`` marker, or when no tag is listed.
    """
    if len(lines) < 2:
        return []
    # No trailing "\n" in the pattern: "." never matches a newline, so the
    # capture stops at the end of the line either way, and a tag line that
    # ends the file without a newline is still recognised.
    match = re.search(r"tags:(.*)", lines[1])
    if match is None:
        return []
    # Remove every whitespace character (spaces, tabs, stray "\r").
    compact = "".join(match.group(1).split())
    # Drop empty entries produced by "a,,b" or a trailing comma; they would
    # otherwise turn into empty path components.
    return [tag for tag in compact.split(",") if tag]
def getPath(tags):
    """Return the relative directory path under which a note with ``tags`` is filed.

    Every ordering of the tags is joined with "/" to form a candidate path.
    If a candidate already exists on disk (relative to the current working
    directory) it is reused, so the same set of tags always lands in the
    same directory whatever order the tags were written in.

    Args:
        tags: List of tag strings.

    Returns:
        "no_tags" when ``tags`` is empty. Otherwise the last candidate (in
        ``itertools.permutations`` order) that exists on disk, or the first
        candidate when none exists yet.
    """
    if not tags:
        return "no_tags"
    candidates = ["/".join(order) for order in itertools.permutations(tags)]
    existing = [candidate for candidate in candidates if os.path.exists(candidate)]
    # A non-empty tag list always yields at least one permutation, so
    # candidates[0] is always available as the fallback.
    return existing[-1] if existing else candidates[0]
def preprocess_filename(filename):
    """Replace punctuation and spaces in ``filename`` with underscores.

    Args:
        filename: File name without directory or extension.

    Returns:
        ``filename`` with each character from the set
        ``-!$%^&*()+|~=`{}[]:";'<>?,./`` and the space character replaced
        by "_". All other characters are left untouched.
    """
    # Raw string so that the backslash escapes reach the regex engine
    # unchanged instead of being (invalid) Python string escapes.
    return re.sub(r"[-!$%^&*()+|~=`{}\[\]:\";'<>?,.\/ ]", "_", filename)
def _extract_imgur_urls(markdown_text):
    """Return the imgur PNG URLs embedded as ``![](url)`` images, in document order."""
    # "[^)\s]+" cannot run past the closing parenthesis or a line break, so
    # two images on the same line are reported as two separate URLs.
    return re.findall(r"!\[\]\((https://i\.imgur\.com/[^)\s]+\.png)\)", markdown_text)


def _download_image(url, target):
    """Fetch ``url`` into ``target`` with wget; return True on success."""
    try:
        # Argument list (no shell): the URL comes from the note's content and
        # must never be interpreted by a shell.
        subprocess.run(["wget", "-q", "-O", target, url], check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing wget binary. Remove whatever partial or
        # empty output file was left behind by the failed attempt.
        if os.path.exists(target):
            os.remove(target)
        return False
    return True


def archive_markdown(files):
    """Copy each markdown note into a tag-derived folder and fetch its imgur images.

    For every path in ``files``:
      * non-``.md`` paths are logged as errors and skipped;
      * empty files are logged as warnings and skipped;
      * the destination is ``<getPath(tags)>/<name>/<name>.md`` where ``name``
        is the sanitised file name;
      * when the destination already holds an identical copy nothing is done;
      * otherwise the destination folder is recreated from scratch, the note
        is copied into it, and each ``![](https://i.imgur.com/....png)`` image
        is downloaded next to it as ``0.png``, ``1.png``, ...

    Progress is drawn with ``progressbar`` and details go to the module logger.

    Args:
        files: List of paths to markdown files.
    """
    count_parsed_files = 0
    # Skip the progress bar entirely when there is nothing to process.
    pending = progressbar(files, "File parsed: ") if files else ()
    for file in pending:
        # Checked before reading so that non-markdown (possibly binary)
        # files are never opened as text.
        if not file.endswith(".md"):
            logger.error("[ERROR] {} is not a markdown file".format(file))
            continue

        # Notes are treated as UTF-8; undecodable bytes are replaced because
        # the text is only used to find tags and image URLs (the archived
        # copy is made byte-for-byte with shutil.copy).
        with open(file, encoding="utf-8", errors="replace") as f:
            lines = f.readlines()

        if not lines:
            logger.warning("[WARN] {} is an empty file. It will not be created".format(file))
            count_parsed_files += 1
            continue

        path = getPath(getTags(lines))

        # Destination folder and file for this note. basename/splitext copes
        # with names that contain ".md" more than once.
        filename = preprocess_filename(os.path.splitext(os.path.basename(file))[0])
        fullpath = path + "/" + filename
        fullpath_filename_extension = fullpath + "/" + filename + ".md"

        # Do nothing if the archived copy is identical to the incoming file.
        if os.path.exists(fullpath_filename_extension) and filecmp.cmp(file, fullpath_filename_extension):
            logger.info("[INFO] '{}' no change.".format(file))
            count_parsed_files += 1
            continue

        # Start from a clean folder so images of a previous version do not linger.
        if os.path.exists(fullpath):
            shutil.rmtree(fullpath)
        os.makedirs(fullpath)
        shutil.copy(file, fullpath_filename_extension)

        image_urls = _extract_imgur_urls("".join(lines))
        if not image_urls:
            logger.warning("[WARN] {} has no image url to parse".format(file))
        for index, url in enumerate(image_urls):
            target = fullpath + "/" + str(index) + ".png"
            if not _download_image(url, target):
                logger.error("[ERROR] {} of '{}' is not valid.".format(url, file))

        logger.info("[DONE] '{}'".format(file))
        logger.info("-" * 20)
        count_parsed_files += 1

    logger.info("[END] Ratio parsed_files/total_files: {} / {}".format(count_parsed_files, len(files)))
if __name__ == '__main__':
    # Command-line entry point: unpack the exported zip into tmp/ and archive
    # every markdown file found at its top level.
    parser = argparse.ArgumentParser()
    # Required: without it ZipFile(None) would fail with an opaque traceback.
    parser.add_argument("--zip_file", required=True,
                        help="zip archive containing the exported markdown files")
    args = parser.parse_args()

    # Validate the archive before wiping tmp/, so a bad argument does not
    # destroy the previous extraction.
    if not zipfile.is_zipfile(args.zip_file):
        parser.error("{} is not a readable zip file".format(args.zip_file))

    # Always extract into a fresh tmp/ directory.
    if os.path.exists("tmp/"):
        shutil.rmtree("tmp/")
    os.makedirs("tmp/")

    with zipfile.ZipFile(args.zip_file, 'r') as zip_ref:
        zip_ref.extractall("tmp/")

    files = glob.glob("tmp/*.md")
    archive_markdown(files)
    print("Informations are available in log.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment