Skip to content

Instantly share code, notes, and snippets.

@RuizSerra
Last active September 10, 2022 09:02
Show Gist options
  • Save RuizSerra/e463bf9d79166825379cdba8c6d9b251 to your computer and use it in GitHub Desktop.
Save RuizSerra/e463bf9d79166825379cdba8c6d9b251 to your computer and use it in GitHub Desktop.
PaperSpan to Instapaper
"""
Convert bookmark exports from PaperSpan, Diigo and Pocket into a CSV file that Instapaper can import.
Run the script, then browse to https://www.instapaper.com/user and select "Import from Instapaper CSV" to upload the generated file.
"""
import datetime
import csv
import re
OUTPUT_CSV = '/Users/foobar/Downloads/instapaper-import.csv'
INPUT_PAPERSPAN = '/Users/foobar/Downloads/ps_export.html'
INPUT_DIIGO = '/Users/jaime/Downloads/diigo-export.csv'
INPUT_POCKET = '/Users/jaime/Downloads/pocket-export.html'

# Column order of the generated CSV. Kept explicit so the header can be
# written even when no entries were collected.
INSTAPAPER_FIELDS = ('URL', 'Title', 'Selection', 'Folder', 'Timestamp')

# ----------------------------------------------------------------------------
# Line-level patterns for the HTML exports (one bookmark per <li> line).
_PAPERSPAN_ENTRY = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)">(.+?)<\/a><\/li>')
_POCKET_ENTRY = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)" tags="(.*)">(.+?)<\/a><\/li>')
_POCKET_ARCHIVE_HEADER = re.compile(r'<h1>Read Archive</h1>')


def _row(url, title, folder, timestamp):
    """Build one output row keyed by the INSTAPAPER_FIELDS column names."""
    return {'URL': url, 'Title': title, 'Selection': '',
            'Folder': folder, 'Timestamp': timestamp}


# ----------------------------------------------------------------------------
def _parse_paperspan(path):
    """Return rows for every unread bookmark in a PaperSpan HTML export.

    Only lines containing ``<li><a `` are considered; lines that mention
    ``time_read`` are skipped. Candidate lines that do not match the
    expected markup are printed so they can be inspected by hand.
    """
    rows = []
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            if '<li><a ' not in raw or 'time_read' in raw:
                continue
            line = raw.strip()
            found = _PAPERSPAN_ENTRY.match(line)
            if not found:
                print(line)
                continue
            url, timestamp, title = found.groups()
            rows.append(_row(url, title, 'Imported', timestamp))
    return rows


# ----------------------------------------------------------------------------
def _parse_diigo(path):
    """Return rows for every record of a Diigo CSV export.

    Reads the ``url`` and ``title`` columns. All rows are stamped with the
    time of the conversion.
    """
    # Computed once: the value is the same "import time" for every row.
    # NOTE(review): this is in milliseconds, whereas the PaperSpan/Pocket
    # `time_added` values are passed through unchanged -- confirm which unit
    # Instapaper expects.
    now_ms = int(datetime.datetime.now().timestamp() * 1000)
    # newline='' is what the csv module asks for; utf-8-sig also accepts a
    # file that starts with a byte-order mark.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        return [_row(r['url'], r['title'], 'Imported', now_ms)
                for r in csv.DictReader(f)]


# ----------------------------------------------------------------------------
def _parse_pocket(path):
    """Return rows for every bookmark in a Pocket HTML export.

    Entries found before the ``Read Archive`` heading go to the 'Imported'
    folder, entries after it to 'Archive'. Every other line that does not
    match the entry markup is printed with a 'NO MATCH' prefix.
    """
    rows = []
    folder = 'Imported'
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if _POCKET_ARCHIVE_HEADER.match(line):
                folder = 'Archive'
                # The heading is not an entry; don't report it as a failure.
                continue
            found = _POCKET_ENTRY.match(line)
            if not found:
                print('NO MATCH', line)
                continue
            url, timestamp, _tags, title = found.groups()
            rows.append(_row(url, title, folder, timestamp))
    return rows


# ----------------------------------------------------------------------------
def _write_instapaper_csv(rows, path):
    """Write *rows* to *path* as CSV with an INSTAPAPER_FIELDS header."""
    # newline='' stops the csv module's own line endings being translated
    # again on platforms that rewrite '\n'.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, INSTAPAPER_FIELDS)
        writer.writeheader()
        writer.writerows(rows)


def main(paperspan_path=INPUT_PAPERSPAN, diigo_path=INPUT_DIIGO,
         pocket_path=INPUT_POCKET, output_csv=OUTPUT_CSV):
    """Convert all available exports and write one combined CSV.

    Each source is parsed in turn and its rows are appended to a single
    list, so the CSV contains the entries of every source rather than only
    those of the source processed last. A source whose input file does not
    exist is skipped with a message. The CSV is always written, containing
    just the header row when nothing was collected.

    Args:
        paperspan_path: Path of the PaperSpan HTML export.
        diigo_path: Path of the Diigo CSV export.
        pocket_path: Path of the Pocket HTML export.
        output_csv: Path of the CSV file to create.
    """
    sources = (
        ('PaperSpan', paperspan_path, _parse_paperspan),
        ('Diigo', diigo_path, _parse_diigo),
        ('Pocket', pocket_path, _parse_pocket),
    )
    output = []
    for label, path, parse in sources:
        try:
            rows = parse(path)
        except FileNotFoundError:
            print(f'Skipping {label}: {path} not found.')
            continue
        print(f'Formatted {len(rows)} entries.')
        output.extend(rows)

    _write_instapaper_csv(output, output_csv)
    print(f'Formatting complete. See output file {output_csv}')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment