# RuizSerra/instapaper_import.py

## instapaper_import.py
"""
Given exports from PaperSpan, Diigo, Pocket, format for import to Instapaper.

Run the script, browse to https://www.instapaper.com/user and select "Import from Instapaper CSV"
"""

import datetime
import csv
import re

OUTPUT_CSV = '/Users/foobar/Downloads/instapaper-import.csv'

# Column order of the output CSV; every row dict below has exactly these keys.
FIELDNAMES = ['URL', 'Title', 'Selection', 'Folder', 'Timestamp']


def make_row(url, title, folder, timestamp):
    """Return one output row; the 'Selection' column is always left empty."""
    return {'URL': url, 'Title': title, 'Selection': '',
            'Folder': folder, 'Timestamp': timestamp}


def read_export_lines(path):
    """Return the lines of the export file at *path*.

    A missing file is reported and yields an empty list, so the script still
    works when only some of the three services were exported.
    """
    try:
        with open(path, 'r') as f:
            return f.readlines()
    except FileNotFoundError:
        print(f'Skipping {path}: file not found.')
        return []


# ----------------------------------------------------------------------------
INPUT_PAPERSPAN = '/Users/foobar/Downloads/ps_export.html'
PAPERSPAN_PAT = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)">(.+?)<\/a><\/li>')


def parse_paperspan(lines):
    """Convert the lines of a PaperSpan HTML export into output rows.

    Only lines containing '<li><a ' are considered, and lines carrying a
    ``time_read`` attribute are dropped (presumably already-read items --
    confirm against a real export). Bookmark lines that do not match the
    expected markup are printed and skipped. ``time_added`` is passed through
    verbatim as the row timestamp.
    """
    rows = []
    for line in lines:
        if '<li><a ' not in line or 'time_read' in line:
            continue
        entry = line.strip()
        result = PAPERSPAN_PAT.match(entry)
        if not result:
            print(entry)
            continue
        url, timestamp, title = result.groups()
        rows.append(make_row(url, title, 'Imported', timestamp))
    return rows


# ----------------------------------------------------------------------------
INPUT_DIIGO = '/Users/jaime/Downloads/diigo-export.csv'


def parse_diigo(rows, timestamp=None):
    """Convert Diigo CSV records (mappings with 'url' and 'title') into output rows.

    All rows share one timestamp. When *timestamp* is None the current time
    in milliseconds since the epoch is used, computed once rather than per row.
    NOTE(review): the other sources pass ``time_added`` through unchanged;
    confirm that milliseconds is the unit Instapaper expects here.
    """
    if timestamp is None:
        timestamp = int(datetime.datetime.timestamp(
                            datetime.datetime.now())*1000)
    return [make_row(r['url'], r['title'], 'Imported', timestamp) for r in rows]


# ----------------------------------------------------------------------------
INPUT_POCKET = '/Users/jaime/Downloads/pocket-export.html'
POCKET_READ_PAT = re.compile(r'<h1>Read Archive</h1>')
# tags uses [^"]* so the group cannot run past its closing quote into the title.
POCKET_PAT = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)" tags="([^"]*)">(.+?)<\/a><\/li>')


def parse_pocket(lines):
    """Convert the lines of a Pocket HTML export into output rows.

    Bookmarks found before the '<h1>Read Archive</h1>' heading go to the
    'Imported' folder; bookmarks after it go to 'Archive'. Lines that are not
    bookmarks are ignored silently; bookmark lines that do not match the
    expected markup are printed with a 'NO MATCH' prefix and skipped.
    """
    folder = 'Imported'
    rows = []
    for line in lines:
        entry = line.strip()
        if POCKET_READ_PAT.match(entry):
            folder = 'Archive'
            continue
        if '<li><a ' not in entry:
            continue
        result = POCKET_PAT.match(entry)
        if not result:
            print('NO MATCH', entry)
            continue
        url, timestamp, _tags, title = result.groups()
        rows.append(make_row(url, title, folder, timestamp))
    return rows


# ----------------------------------------------------------------------------
# Rows from all sources accumulate in one list so that every export ends up in
# the CSV (re-initialising the list per source would keep only the last one).
output = []

paperspan_rows = parse_paperspan(read_export_lines(INPUT_PAPERSPAN))
output.extend(paperspan_rows)
print(f'Formatted {len(paperspan_rows)} entries.')

try:
    # newline='' is what the csv module requires for files it reads or writes.
    with open(INPUT_DIIGO, 'r', newline='') as f:
        diigo_rows = parse_diigo(csv.DictReader(f))
except FileNotFoundError:
    print(f'Skipping {INPUT_DIIGO}: file not found.')
    diigo_rows = []
output.extend(diigo_rows)
print(f'Formatted {len(diigo_rows)} entries.')

pocket_rows = parse_pocket(read_export_lines(INPUT_POCKET))
output.extend(pocket_rows)
print(f'Formatted {len(pocket_rows)} entries.')

# ----------------------------------------------------------------------------
if output:
    with open(OUTPUT_CSV, 'w', newline='') as f:
        # Explicit field names: no dependence on output[0] existing.
        w = csv.DictWriter(f, FIELDNAMES)
        w.writeheader()
        w.writerows(output)
    print(f'Formatting complete. See output file {OUTPUT_CSV}')
else:
    print('No entries found; nothing written.')
	"""
	Given exports from PaperSpan, Diigo, Pocket, format for import to Instapaper.

	Run the script, browse to https://www.instapaper.com/user and select "Import from Instapaper CSV"
	"""

	import datetime
	import csv
	import re

	OUTPUT_CSV = '/Users/foobar/Downloads/instapaper-import.csv'

	# Column order of the output CSV; every row dict below has exactly these keys.
	FIELDNAMES = ['URL', 'Title', 'Selection', 'Folder', 'Timestamp']


	def make_row(url, title, folder, timestamp):
		"""Return one output row; the 'Selection' column is always left empty."""
		return {'URL': url, 'Title': title, 'Selection': '',
			'Folder': folder, 'Timestamp': timestamp}


	def read_export_lines(path):
		"""Return the lines of the export file at *path*.

		A missing file is reported and yields an empty list, so the script still
		works when only some of the three services were exported.
		"""
		try:
			with open(path, 'r') as f:
				return f.readlines()
		except FileNotFoundError:
			print(f'Skipping {path}: file not found.')
			return []


	# ----------------------------------------------------------------------------
	INPUT_PAPERSPAN = '/Users/foobar/Downloads/ps_export.html'
	PAPERSPAN_PAT = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)">(.+?)<\/a><\/li>')


	def parse_paperspan(lines):
		"""Convert the lines of a PaperSpan HTML export into output rows.

		Only lines containing '<li><a ' are considered, and lines carrying a
		``time_read`` attribute are dropped (presumably already-read items --
		confirm against a real export). Bookmark lines that do not match the
		expected markup are printed and skipped. ``time_added`` is passed through
		verbatim as the row timestamp.
		"""
		rows = []
		for line in lines:
			if '<li><a ' not in line or 'time_read' in line:
				continue
			entry = line.strip()
			result = PAPERSPAN_PAT.match(entry)
			if not result:
				print(entry)
				continue
			url, timestamp, title = result.groups()
			rows.append(make_row(url, title, 'Imported', timestamp))
		return rows


	# ----------------------------------------------------------------------------
	INPUT_DIIGO = '/Users/jaime/Downloads/diigo-export.csv'


	def parse_diigo(rows, timestamp=None):
		"""Convert Diigo CSV records (mappings with 'url' and 'title') into output rows.

		All rows share one timestamp. When *timestamp* is None the current time
		in milliseconds since the epoch is used, computed once rather than per row.
		NOTE(review): the other sources pass ``time_added`` through unchanged;
		confirm that milliseconds is the unit Instapaper expects here.
		"""
		if timestamp is None:
			timestamp = int(datetime.datetime.timestamp(
				datetime.datetime.now())*1000)
		return [make_row(r['url'], r['title'], 'Imported', timestamp) for r in rows]


	# ----------------------------------------------------------------------------
	INPUT_POCKET = '/Users/jaime/Downloads/pocket-export.html'
	POCKET_READ_PAT = re.compile(r'<h1>Read Archive</h1>')
	# tags uses [^"]* so the group cannot run past its closing quote into the title.
	POCKET_PAT = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)" tags="([^"]*)">(.+?)<\/a><\/li>')


	def parse_pocket(lines):
		"""Convert the lines of a Pocket HTML export into output rows.

		Bookmarks found before the '<h1>Read Archive</h1>' heading go to the
		'Imported' folder; bookmarks after it go to 'Archive'. Lines that are not
		bookmarks are ignored silently; bookmark lines that do not match the
		expected markup are printed with a 'NO MATCH' prefix and skipped.
		"""
		folder = 'Imported'
		rows = []
		for line in lines:
			entry = line.strip()
			if POCKET_READ_PAT.match(entry):
				folder = 'Archive'
				continue
			if '<li><a ' not in entry:
				continue
			result = POCKET_PAT.match(entry)
			if not result:
				print('NO MATCH', entry)
				continue
			url, timestamp, _tags, title = result.groups()
			rows.append(make_row(url, title, folder, timestamp))
		return rows


	# ----------------------------------------------------------------------------
	# Rows from all sources accumulate in one list so that every export ends up in
	# the CSV (re-initialising the list per source would keep only the last one).
	output = []

	paperspan_rows = parse_paperspan(read_export_lines(INPUT_PAPERSPAN))
	output.extend(paperspan_rows)
	print(f'Formatted {len(paperspan_rows)} entries.')

	try:
		# newline='' is what the csv module requires for files it reads or writes.
		with open(INPUT_DIIGO, 'r', newline='') as f:
			diigo_rows = parse_diigo(csv.DictReader(f))
	except FileNotFoundError:
		print(f'Skipping {INPUT_DIIGO}: file not found.')
		diigo_rows = []
	output.extend(diigo_rows)
	print(f'Formatted {len(diigo_rows)} entries.')

	pocket_rows = parse_pocket(read_export_lines(INPUT_POCKET))
	output.extend(pocket_rows)
	print(f'Formatted {len(pocket_rows)} entries.')

	# ----------------------------------------------------------------------------
	if output:
		with open(OUTPUT_CSV, 'w', newline='') as f:
			# Explicit field names: no dependence on output[0] existing.
			w = csv.DictWriter(f, FIELDNAMES)
			w.writeheader()
			w.writerows(output)
		print(f'Formatting complete. See output file {OUTPUT_CSV}')
	else:
		print('No entries found; nothing written.')