Last active
February 26, 2024 04:05
-
-
Save pirate/63924cac848f3519ae425aeccf0721a9 to your computer and use it in GitHub Desktop.
Archive pocket data to local folder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Nick Sweeting 2017 | MIT License
# Usage:
#    1. Download pocket export from https://getpocket.com/export to ril_export.html
#    2. Run ./dump_pocket.py ril_export.html
#    3. Archive is saved in "pocket" folder
# Dependencies (chrome, python3, wget, curl):
#    wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
#    sudo sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
#    apt update; apt install google-chrome-beta python3 wget curl
import re | |
import os | |
import sys | |
from datetime import datetime | |
from subprocess import run, DEVNULL | |
# Screenshot resolution as "width,height"; passed to headless Chrome via --window-size.
RESOLUTION = '1440,900'
def parse_pocket_export(html):
    """Yield one link dict for every bookmark line found in a Pocket export.

    Args:
        html: An iterable of text lines, e.g. an open ``ril_export.html`` file.
            Lines that do not match the ``<li><a href=... time_added=...
            tags=...>title</a></li>`` shape are skipped.

    Yields:
        dict with the keys:
            url: bookmarked URL with the Readability proxy prefix removed.
            domain: ``url`` without its scheme, up to the first ``/``.
            base_url: ``url`` without its scheme, up to the first ``?``.
            time: ``datetime`` built from the ``time_added`` attribute.
            timestamp: the ``time_added`` attribute as it appears (a str).
            tags: raw contents of the ``tags`` attribute.
            title: link text with the Readability decorations removed.
    """
    readability_prefix = 'http://www.readability.com/read?url='
    pattern = re.compile(
        r'^\s*<li><a href="(.+)" time_added="(\d+)" tags="(.*)">(.+)</a></li>',
        re.UNICODE,
    )

    for line in html:
        match = pattern.search(line)
        if not match:
            continue

        raw_url, timestamp, tags, raw_title = match.groups()

        # Strip the proxy prefix first so that domain and base_url describe the
        # real target site instead of www.readability.com.
        url = raw_url.replace(readability_prefix, '')
        without_scheme = url.replace('http://', '').replace('https://', '')

        yield {
            'url': url,
            'domain': without_scheme.split('/')[0],
            'base_url': without_scheme.split('?')[0],
            'time': datetime.fromtimestamp(int(timestamp)),
            'timestamp': timestamp,
            'tags': tags,
            'title': raw_title.replace(' β Readability', '').replace(readability_prefix, ''),
        }
def dump_index(links):
    """Write ``pocket/index.html``, a table with one row per archived link.

    Args:
        links: Iterable of link dicts. Each dict must provide the keys used by
            the row template: ``time``, ``timestamp``, ``base_url``, ``title``,
            ``domain`` and ``url``.

    The ``pocket`` directory must already exist. Values are inserted into the
    page as-is (no HTML escaping is applied here).
    """
    # Literal CSS braces are doubled because the template goes through
    # str.format(); the two bare {} slots are the "last modified" time and the
    # rendered table rows, in that order.
    index_html = """
    <html lang="en">
        <head>
            <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
            <title>Archived Sites | Nick Sweeting</title>
            <style>
                html, body {{
                    width: 100%;
                    height: 100%;
                    font-size: 20px;
                    font-weight: 200;
                    text-align: center;
                    margin: 0px;
                    padding: 0px;
                    font-family: "Gill Sans", Helvetica, sans-serif;
                }}
                header {{
                    background-color: #aa1e55;
                    color: white;
                    padding: 10px;
                }}
                header h1 {{
                    font-weight: 300;
                    color: black;
                    margin-top: 10px;
                    margin-bottom: 12px;
                }}
                header h1 small {{
                    color: white;
                    font-size:0.5em;
                }}
                header h1 small a {{
                    text-decoration: none;
                    color: orange;
                    opacity: 0.6;
                    font-weight: 300;
                }}
                header h1 small a:hover {{
                    opacity: 1;
                }}
                table {{
                    padding: 6px;
                    width: 100%;
                }}
                table thead th {{
                    font-weight: 400;
                }}
                tbody tr:nth-child(odd) {{
                    background-color: #ffebeb;
                }}
                table tr td {{
                    white-space: nowrap;
                    overflow: hidden;
                    padding-bottom: 0.4em;
                    padding-top: 0.4em;
                    padding-left: 2px;
                }}
                table tr td img {{
                    height: 24px;
                    padding: 0px;
                    padding-right: 5px;
                    text-indent: -10000px;
                }}
            </style>
        </head>
        <body>
            <header>
                <h1 title="Last modified {}">
                    <img src="https://nicksweeting.com/images/archive.png" height="36px">
                    Nick Sweeting | Archived Sites <img src="https://getpocket.com/favicon.ico" height="36px"> <br/>
                    <small>
                        Via: <a href="https://getpocket.com/export">getpocket.com/export</a> + <a href="https://gist.github.com/pirate/63924cac848f3519ae425aeccf0721a9">archive_pocket.py</a>
                        | <a href="https://getpocket.com/users/nikisweeting/feed/all">RSS Feed</a> | <a href="https://sweeting.me">sweeting.me</a>
                    </small>
                </h1>
            </header>
            <table style="width:100%;height: 90%; overflow-y: scroll;table-layout: fixed">
                <thead>
                    <tr>
                        <th style="width: 140px;"><img src="https://getpocket.com/favicon.ico" height="12px"> Pocketed Date</th>
                        <th style="width: 45vw;">Saved Article</th>
                        <th style="width: 50px">Files</th>
                        <th style="width: 50px">PDF</th>
                        <th style="width: 80px">Screenshot</th>
                        <th style="width: 100px;white-space:nowrap;overflow-x:scroll;display:block">Original URL</th>
                    </tr>
                </thead>
                <tbody>{}</tbody>
            </table>
        </body>
    </html>
    """

    link_html = """\
    <tr>
        <td>{time}</td>
        <td><a href="archive/{timestamp}/{base_url}" style="font-size:1.4em;text-decoration:none;color:black;" title="{title}">
            <img src="archive/{timestamp}/favicon.ico">
            {title}
        </a></td>
        <td style="text-align:center"><a href="archive/{timestamp}/" title="Files">π</a></td>
        <td style="text-align:center"><a href="archive/{timestamp}/output.pdf" title="PDF">π</a></td>
        <td style="text-align:center"><a href="archive/{timestamp}/screenshot.png" title="Screenshot">πΌ</a></td>
        <td>π <img src="https://www.google.com/s2/favicons?domain={domain}" height="16px"> <a href="{url}">{url}</a></td>
    </tr>"""

    # Render the whole page before touching the file so that a formatting
    # error (e.g. a missing key) cannot leave a truncated index behind.
    article_rows = '\n'.join(link_html.format(**link) for link in links)
    page = index_html.format(datetime.now().strftime('%Y-%m-%d %H:%M'), article_rows)

    # The page declares charset=UTF-8 and holds non-ASCII text, so the encoding
    # is pinned instead of depending on the platform default.
    with open('pocket/index.html', 'w', encoding='utf-8') as f:
        f.write(page)
def _run_archive_command(cmd, out_dir, stdout=DEVNULL):
    """Run one archiving command inside *out_dir*; print failures instead of raising."""
    try:
        run(cmd, stdout=stdout, stderr=DEVNULL, cwd=out_dir, timeout=20)
    except Exception as e:
        # Best effort: one failing or timed-out step must not abort the archive.
        print(' Exception: {}'.format(e.__class__.__name__))


def dump_website(link, overwrite=False):
    """Download the site, a PDF, a screenshot and the favicon for one link.

    Output goes to ``pocket/archive/<timestamp>``. A step is skipped when its
    output already exists, unless *overwrite* is true. Failures of the
    individual download commands are printed and otherwise ignored.

    Args:
        link: Link dict; reads the keys ``time``, ``title``, ``url``,
            ``timestamp``, ``base_url`` and ``domain``.
        overwrite: When true, redo every step even if its output exists.
    """
    print('[+] [{time}] Archiving "{title}": {url}'.format(**link))

    out_dir = 'pocket/archive/{timestamp}'.format(**link)
    os.makedirs(out_dir, exist_ok=True)

    if link['base_url'].endswith('.pdf'):
        print(' i PDF File')
    elif 'youtube.com' in link['domain']:
        print(' i Youtube Video')
    elif 'wikipedia.org' in link['domain']:
        print(' i Wikipedia Article')

    # download full site (wget writes into a folder named after the domain)
    if overwrite or not os.path.exists('{}/{}'.format(out_dir, link['domain'])):
        print(' - Downloading Full Site')
        wget_cmd = [
            'wget',
            '--no-clobber',
            '--page-requisites',
            '--adjust-extension',
            '--convert-links',
            '--no-parent',
            link['url'],
        ]
        _run_archive_command(wget_cmd, out_dir)
    else:
        print(' β Skipping site download')

    # download PDF (checked for as output.pdf in out_dir)
    if overwrite or not os.path.exists('{}/output.pdf'.format(out_dir)):
        print(' - Printing PDF')
        pdf_cmd = ['google-chrome', '--headless', '--disable-gpu', '--print-to-pdf', link['url']]
        _run_archive_command(pdf_cmd, out_dir)
    else:
        print(' β Skipping PDF print')

    # take screenshot (checked for as screenshot.png in out_dir)
    if overwrite or not os.path.exists('{}/screenshot.png'.format(out_dir)):
        print(' - Snapping Screenshot')
        screenshot_cmd = [
            'google-chrome',
            '--headless',
            '--disable-gpu',
            '--screenshot',
            '--window-size={}'.format(RESOLUTION),
            link['url'],
        ]
        _run_archive_command(screenshot_cmd, out_dir)
    else:
        print(' β Skipping screenshot')

    # download favicon
    if overwrite or not os.path.exists('{}/favicon.ico'.format(out_dir)):
        print(' - Fetching Favicon')
        favicon_url = 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)
        # curl writes the image to stdout, so the target is opened in binary
        # mode; the with-block closes it even if the run is interrupted.
        with open('{}/favicon.ico'.format(out_dir), 'wb') as fout:
            _run_archive_command(['curl', favicon_url], out_dir, stdout=fout)
    else:
        print(' β Skipping favicon')

    run(['chmod', '-R', '755', out_dir], timeout=1)
def create_archive(pocket_file, resume=None):
    """Build the ``pocket`` archive (index page plus per-link folders).

    Args:
        pocket_file: Path to the Pocket export HTML file, read as UTF-8.
        resume: Optional timestamp (int or numeric str). When given, only
            links whose timestamp is greater than or equal to it are kept.

    Raises:
        SystemExit: With status 1 when no links remain to archive.
        ValueError: If *resume* is given but is not an integer value.
    """
    print('[+] [{}] Starting pocket archive from {}'.format(datetime.now(), pocket_file))

    # Creates "pocket" as well as "pocket/archive"; a no-op when both exist.
    os.makedirs('pocket/archive', exist_ok=True)

    with open(pocket_file, 'r', encoding='utf-8') as f:
        # parse_pocket_export yields lazily, so consume it while the file is
        # open. Timestamps are compared as integers, not as strings, so that
        # values of different lengths still order correctly.
        links = sorted(parse_pocket_export(f), key=lambda link: int(link['timestamp']))
    links.reverse()  # most recent first

    if resume:
        resume_at = int(resume)
        links = [link for link in links if int(link['timestamp']) >= resume_at]

    if not links:
        print('[X] No links found in {}'.format(pocket_file))
        raise SystemExit(1)

    dump_index(links)

    run(['chmod', '-R', '755', 'pocket'], timeout=1)

    print('[*] [{}] Created archive index.'.format(datetime.now()))

    for link in links:
        dump_website(link)

    print('[β] [{}] Archive complete.'.format(datetime.now()))
if __name__ == '__main__':
    # Command line: ./dump_pocket.py [pocket_export.html] [resume_timestamp]
    # sys.argv[0] is the script itself, so the export path is the first real
    # argument and the optional resume timestamp is the second.
    cli_args = sys.argv[1:]
    pocket_file = cli_args[0] if len(cli_args) > 0 else 'ril_export.html'  # path to pocket export html
    resume = cli_args[1] if len(cli_args) > 1 else None  # timestamp to resume at

    create_archive(pocket_file, resume=resume)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment