Skip to content

Instantly share code, notes, and snippets.

@villares
Last active November 13, 2023 00:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save villares/f4f35630c28222767dd276fd08bcdbb5 to your computer and use it in GitHub Desktop.
Save villares/f4f35630c28222767dd276fd08bcdbb5 to your computer and use it in GitHub Desktop.
Backup HackMD.io Notes
"""
Downloads images from my notes at HackMD.io.
Creates new MD files with relative path image links.
Uses the links from the 'notes-backup' files downloaded with
download_notes.py (run it first)
I was skipping images hosted from my dreamhost account
but decided to download them too.
"""
import re
import requests
from pathlib import Path
# Host/path fragment that identifies images served from the author's own site.
self_hosted = 'lugaralgum.com/hackmd'
# Notes previously saved by download_notes.py are read from here...
input_dir = Path.cwd().joinpath('notes-backup')
# ...and the rewritten notes plus their images are stored here.
output_dir = Path.cwd().joinpath('notes-with-local-images')
output_dir.mkdir(exist_ok=True)
def _download_image(link, img_path, note_name):
    """Fetch *link* and store it at *img_path*.

    Returns True when the image was saved, False when the request failed or
    the server answered with a status other than 200.  *note_name* is only
    used in the progress messages.
    """
    try:
        # A timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(link, timeout=30)
    except Exception as err:  # best effort: report and move on to the next link
        print(f'Skipped {link} ({err})')
        return False
    if response.status_code != 200:
        print(f"FAILED TO DOWNLOAD {img_path.name} from {note_name}.")
        return False
    img_path.write_bytes(response.content)
    print(f"Downloaded {img_path.name} from {note_name}.")
    return True


def download_images_from_md_files(directory_path, out_path):
    """Copy Markdown notes to *out_path*, downloading the images they embed.

    Every ``*.md`` file found recursively under *directory_path* is scanned
    for Markdown image links (``![alt](url)``).  Each link that starts with
    ``http`` is downloaded into *out_path*, unless a file with the same name
    is already there, and the link inside the note is replaced by the bare
    image file name.  The rewritten note is then saved in *out_path* under
    its original file name.

    A link is rewritten as soon as its image is available locally, whether
    it was just downloaded or was already present.  Links whose download
    fails, and links that do not start with ``http``, are left untouched.

    Args:
        directory_path: Directory (str or Path) searched for ``*.md`` files.
        out_path: Directory (str or Path) that receives the images and the
            rewritten notes; it is created when missing.
    """
    out_path = Path(out_path)
    out_path.mkdir(parents=True, exist_ok=True)
    for md_file in Path(directory_path).rglob("*.md"):
        # The download script writes its notes as UTF-8, so read them the same way.
        content = md_file.read_text(encoding="utf-8")
        for link in re.findall(r"!\[.*?\]\((.*?)\)", content):
            if not link.startswith("http"):
                print(f'Skipped {link}')
                continue
            img_name = link.split("/")[-1]
            if not img_name:
                # A URL ending in "/" has no file name to save the image under.
                print(f'Skipped {link}')
                continue
            img_path = out_path / img_name
            if img_path.is_file() or _download_image(link, img_path, md_file.name):
                content = content.replace(link, img_name)
        out_file = out_path / md_file.name
        out_file.write_text(content, encoding="utf-8")
        print(f'saved {out_file.name}.')
# Script entry point: process the notes in input_dir, writing into output_dir.
download_images_from_md_files(directory_path=input_dir, out_path=output_dir)
from datetime import datetime
from pathlib import Path
import pickle
try:
import tomllib
except ImportError:
import tomli as tomllib # tomllib will be in Python 3.11's standard library only
from PyHackMD import API # Install with pip, more info at https://github.com/GoatWang/PyHackMD
# Personal HackMD API token; get one at https://hackmd.io/settings#api
access_token = '<PUT YOUR ACCESS TOKEN HERE>'
# Alternative: keep the token in a TOML file named api_tokens containing
#     [hackmd]
#     access_token = ...
# and load it like this:
# with open("/home/username/api_tokens", "rb") as f:
#     api_tokens = tomllib.load(f)
# access_token = api_tokens['hackmd']['access_token']

# Notes are written to this folder; the pickled note list sits next to it.
output_dir = Path.cwd().joinpath('notes-backup')
output_dir.mkdir(exist_ok=True)
note_list_pickle_file = output_dir.parent.joinpath('note_list.data')

api = API(access_token)
note_list = api.get_note_list()

# Load the note list stored by the previous run, if there was one.
try:
    with note_list_pickle_file.open('rb') as f:
        previous_note_list = pickle.load(f)
except FileNotFoundError:
    print('A new note_list.data file will be created.')
    previous_note_list = []
def find_previous_by_id(id, notes=None):
    """Return the stored note entry whose ``'id'`` equals *id*.

    Args:
        id: Note identifier to look for.  (The name shadows the builtin but
            is kept so existing callers keep working.)
        notes: Iterable of note dicts to search.  Defaults to the
            module-level ``previous_note_list``.

    Returns:
        The first matching note dict, or an empty dict when none matches.
    """
    if notes is None:
        notes = previous_note_list
    # Compare against the argument itself, not a global that happens to
    # share the caller's loop variable name.
    return next((note for note in notes if note['id'] == id), {})
# Imported here so this section brings in everything its timestamp
# formatting needs.
from datetime import timezone

# Download every note that is new or whose 'lastChangedAt' differs from the
# value recorded by the previous run; report the others as skipped.
for note_data in note_list:
    note_id = note_data['id']
    previous = find_previous_by_id(note_id)
    if previous and previous['lastChangedAt'] == note_data['lastChangedAt']:
        # 'lastChangedAt' is divided by 1000 to get a Unix timestamp in
        # seconds, which is shown as a UTC date and time.
        last_changed_unix_dts = note_data['lastChangedAt'] / 1000
        dts = (datetime.fromtimestamp(last_changed_unix_dts, tz=timezone.utc)
               .strftime('%Y-%m-%d %H:%M:%S'))
        print(f'Skipping... '
              f'{note_id} '
              f'{note_data["title"]} '
              f'{dts}')
        continue

    print(f'Downloading... '
          f'{note_id} '
          f'{note_data["title"]} '
          f'{note_data["publishLink"]}')
    content = api.get_note(note_id)['content']
    permalink = note_data["permalink"]
    # Name the file after the permalink when there is one; otherwise use the
    # title (or, failing that, the note id) with '/' and ' ' replaced so the
    # result is a single path component.
    file_name = (f'{permalink}.md' if permalink
                 else f'{note_data["title"] or note_id}.md'.replace('/', '-').replace(' ', '-'))
    with open(output_dir / file_name, 'w', encoding='utf-8') as f:
        f.write(content)

# Remember the current note list so the next run can detect changes.
with open(note_list_pickle_file, 'wb') as f:
    pickle.dump(note_list, f)
# you should be able to run download_images_modify_files.py after this
@villares
Copy link
Author

villares commented Jul 3, 2023

The final step was to change the imgur URLs to point at the images served from my Dreamhost account. Now that HackMD itself has added some (limited) image hosting, I didn't run this again.

"""
Changes image links on my HackMD.io pages to my_server_path/image_file_name

Better run this only after uploading all images to my hosting account
dir '/home/abav/public_example.com/hackmd'.
"""

import re

from pathlib import Path
import tomli as tomllib # tomllib will be in Python 3.11's standard library only
from PyHackMD import API # https://github.com/GoatWang/PyHackMD

images_dir = Path.cwd() / 'notes-with-local-images'
# Base URL that every rewritten image link will start with.
my_url = 'https://example.com/hackmd/'

access_token = '<PUT YOUR ACCESS TOKEN HERE>'  # get one at https://hackmd.io/settings#api
# # If you make a TOML file for your access tokens named api_tokens: [hackmd] ... access_token = ...
# with open("/home/username/api_tokens", "rb") as f:
#    api_tokens = tomllib.load(f)
# access_token = api_tokens['hackmd']['access_token']

api = API(access_token)
note_list = api.get_note_list()

# For every note, point each external image link at my_url + <image file name>
# and upload the modified note once, after all of its links were processed.
for note_data in note_list:
    note_id = note_data['id']
    content = api.get_note(note_id)['content']
    needs_update = False
    for link in re.findall(r"!\[.*?\]\((.*?)\)", content):
        print(note_data['title'], link, end=' -> ')
        if link.startswith("http") and my_url not in link:
            img_name = link.split("/")[-1]
            previous_content = content
            content = content.replace(link, my_url + img_name)
            # False when an earlier identical link already replaced every occurrence.
            changed = content != previous_content
            print(changed)
            needs_update = needs_update or changed
        else:
            print('skipped.')
    if needs_update:
        # One API call per note instead of one per rewritten link.
        api.update_note(note_id, content)
        print('UPDATED ' + note_data['title'])
    else:
        print('UNCHANGED ' + note_data['title'])

@villares
Copy link
Author

villares commented Sep 21, 2023

Uploading to Dreamhost:

"""
This will upload all hackmd backup image files to my Dreamhost account
using SFTP with Python paramiko library.

I had to add the server to known hosts with:
ssh-keyscan [host name here] >> ~/.ssh/known_hosts

Files that already exist on the server are not overwritten. This works well
for the images, which get a new name whenever they are updated, but not for
the .md files, which are updated often and keep the same file name.
Markdown files are skipped by default; set SKIP_MARKDOWN = False to upload
them, overwriting any copies already on the server.
"""

from pathlib import Path
import tomli as tomllib # tomllib will be in Python 3.11's standard library only

import paramiko

paramiko.util.log_to_file('ssh-session.log') # sets up logging

# Connection details are read from a private TOML file, section [dreamhost].
with open("/home/villares/api_tokens", "rb") as f:
    api_tokens = tomllib.load(f)

host = api_tokens['dreamhost']['host']
username = api_tokens['dreamhost']['username']
password = api_tokens['dreamhost']['password']
remote_dir = '/home/abav/public_lugaralgum.com/hackmd'
# local_dir = Path.cwd() / 'notes-with-local-images'
local_dir = '/home/villares/GitHub/hackmd-backup/notes-with-local-images/'

# When True, .md files are never uploaded; when False they are uploaded and
# overwrite whatever is on the server.
SKIP_MARKDOWN = True

with paramiko.SSHClient() as ssh:
    # Host keys come from known_hosts; paramiko.AutoAddPolicy() is
    # deliberately not used because it is unsafe.
    ssh.load_system_host_keys()
    ssh.connect(host, username=username, password=password)

    with ssh.open_sftp() as sftp:
        # Remote names below are relative to this directory.
        sftp.chdir(remote_dir)

        for local_file in sorted(Path(local_dir).iterdir()):
            remote_file = local_file.name
            if not local_file.is_file():
                # A sub-directory cannot be sent with put(); skip it instead
                # of aborting the whole upload.
                print(f'Skipping {remote_file}, not a regular file.')
                continue
            if remote_file.lower().endswith('.md'):
                if SKIP_MARKDOWN:
                    print(f'Skipping markdown file {remote_file}.')
                else:
                    sftp.put(str(local_file), remote_file)
                    print(f'Uploaded markdown {remote_file} (with possible overwrite).')
                continue
            try:
                # stat() succeeding means the file is already on the server.
                sftp.stat(remote_file)
            except FileNotFoundError:
                sftp.put(str(local_file), remote_file)
                print(f'Uploaded {remote_file}.')
            else:
                print(f'Skipping {remote_file} already on server.')
        
# import paramiko
# 
# transport = paramiko.Transport((host, 22))
# transport.connect(username=username, password=password)
# 
# security_options = transport.get_security_options()
# print(security_options.ciphers)
# print(security_options.kex)
# 
# transport.close()
"""
('aes128-ctr', 'aes192-ctr', 'aes256-ctr', 'aes128-cbc', 'aes192-cbc', 'aes256-cbc', 'blowfish-cbc', '3des-cbc')
('curve25519-sha256@libssh.org', 'ecdh-sha2-nistp256', 'ecdh-sha2-nistp384', 'ecdh-sha2-nistp521', 'diffie-hellman-group16-sha512', 'diffie-hellman-group-exchange-sha256', 'diffie-hellman-group14-sha256', 'diffie-hellman-group-exchange-sha1', 'diffie-hellman-group14-sha1', 'diffie-hellman-group1-sha1')
"""

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment