Skip to content

Instantly share code, notes, and snippets.

@tetrillard
Last active September 13, 2021 21:23
Show Gist options
  • Save tetrillard/11201c7eed0a3d4d99413d7dcb214649 to your computer and use it in GitHub Desktop.
Save tetrillard/11201c7eed0a3d4d99413d7dcb214649 to your computer and use it in GitHub Desktop.
PearlTree dirty export from HTML (when it's too big to export by zip)
#!/usr/bin/python3
# Backup of a Pearltrees HTML export
import html
import random
import re
import shlex
from urllib.parse import unquote
# Patterns for the bookmark-style lines of the HTML export. Raw strings keep
# regex escapes from being parsed as (invalid) Python string escapes.
RE_FOLDER = re.compile(r'<H3 FOLDED ADD_DATE="[^"]*">([^<]*)</H3>')
RE_LINK = re.compile(r'HREF="([^"]*)"')
RE_NAME = re.compile(r'filename="([^"]*)"')
# Only semicolon-terminated character references are decoded, so a raw query
# string such as "?a=1&copy=2" is never mistaken for the "&copy" entity.
_RE_ENTITY = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')


def _unescape(text):
    """Decode semicolon-terminated HTML character references in *text*."""
    return _RE_ENTITY.sub(lambda ref: html.unescape(ref.group(0)), text)


def export_commands(lines):
    """Yield shell commands that mirror the export as folders and downloads.

    Each line is checked, in this order, for three markers:

    * ``H3 FOLDED``: yields ``mkdir <name>; cd <name>`` for the folder title.
    * ``</DL>``: yields ``cd ..`` to leave the folder most recently entered.
    * ``HREF``: yields an ``echo`` of the wget command, then the wget command.

    Folder names and URLs come from the export file, so they are HTML-decoded
    and then quoted with ``shlex.quote``; the shell therefore never expands
    ``$``, backticks or quotes that they contain.

    Args:
        lines: Iterable of text lines from the export (e.g. an open file).

    Yields:
        One shell command per string, without a trailing newline.

    Raises:
        ValueError: If a line contains ``H3 FOLDED`` but does not have the
            expected ``<H3 FOLDED ADD_DATE="...">name</H3>`` shape.
    """
    # Number of folders entered and not yet left.
    depth = 0
    for number, line in enumerate(lines, start=1):
        if "H3 FOLDED" in line:
            found = RE_FOLDER.search(line)
            if found is None:
                raise ValueError(
                    f"line {number}: unrecognised folder heading: "
                    f"{line.strip()!r}"
                )
            folder = shlex.quote(_unescape(found.group(1)))
            depth += 1
            yield f"mkdir {folder}; cd {folder}"
        # A "</DL>" with no folder open (presumably the outermost list; verify
        # against a real export) is ignored, so the generated script never
        # climbs above the directory it was started in.
        if "</DL>" in line and depth > 0:
            depth -= 1
            yield "cd .."
        if "HREF" in line:
            found = RE_LINK.search(line)
            # "HREF" may appear in free text without an HREF="..." attribute;
            # there is nothing to download in that case.
            if found is not None:
                link = shlex.quote(_unescape(found.group(1)))
                yield f"echo wget --content-disposition {link}"
                yield f"wget --content-disposition {link}"


def main():
    """Read ``export.html`` and print the generated shell script to stdout."""
    with open("export.html", "r") as f:
        # Iterate the file lazily instead of loading every line up front.
        for command in export_commands(f):
            print(command)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment