|
import time |
|
import csv |
|
import dateutil.parser |
|
import requests |
|
import logging |
|
import argparse |
|
|
|
|
|
def read_instapaper_export(filename):
    """Yield bookmark rows from an Instapaper CSV export.

    Each yielded row is a list of strings in the export's column order
    (URL, Title, Selection, Folder). The header row (and any row whose
    first column is not an http(s) URL) is skipped.

    Args:
        filename: Path to the CSV file exported from Instapaper.

    Yields:
        list[str]: one row per bookmark.
    """
    # newline='' is required by the csv module so that newlines embedded
    # inside quoted fields are parsed correctly.
    with open(filename, newline='') as f:
        for row in csv.reader(f):
            # Guard against blank lines (empty rows), which would otherwise
            # raise IndexError on row[0].
            if row and row[0].startswith('http'):
                yield row
|
|
|
|
|
def parse_article_content(url, args):
    """Fetch parsed article content for *url* from the Mercury Web Parser.

    Args:
        url: Article URL to parse.
        args: Parsed CLI arguments; ``args.mercury_token`` is the Mercury
            API key.

    Returns:
        dict: Mercury's response, with ``url`` forced back to the original
        request URL and ``published`` set to an int Unix timestamp, or
        ``None`` when no parseable publication date is present.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
        Exception: when Mercury reports a parse failure (``failed`` flag).
    """
    params = {'url': url}
    headers = {'x-api-key': args.mercury_token}
    response = requests.get('https://mercury.postlight.com/parser', params=params, headers=headers)
    response.raise_for_status()
    content = response.json()
    if 'title' not in content and content.get('failed'):
        raise Exception(content['messages'])
    content['url'] = url  # force url back to original
    try:
        # datetime.timestamp() is portable; the previous strftime('%s') is a
        # glibc-only extension that is undocumented in Python, fails on
        # Windows, and silently ignores timezone info on aware datetimes.
        content['published'] = int(dateutil.parser.parse(content['date_published']).timestamp())
    except (KeyError, TypeError, ValueError):
        # Missing, None, or unparseable date_published — no timestamp.
        content['published'] = None
    return content
|
|
|
|
|
def create_feedly_entry(content, args):
    """POST a tagged entry to Feedly built from parsed article *content*.

    Retries for as long as Feedly answers 429, sleeping for the number of
    seconds the server reports in its X-RateLimit-Reset header.

    Args:
        content: dict with the article fields produced by
            parse_article_content (url, title, author, excerpt, content,
            published, lead_image_url).
        args: Parsed CLI arguments; supplies the Feedly token and tag.

    Returns:
        The decoded JSON body of the successful Feedly response.

    Raises:
        requests.HTTPError: on any non-429 error status.
    """
    # Feedly wants millisecond timestamps; propagate None when unknown.
    millis = content['published'] * 1000 if content['published'] else None
    entry = {
        'title': content['title'],
        'author': content['author'],
        'published': millis,
        'crawled': millis,
        'updated': millis,
        'origin': {'htmlUrl': content['url'], 'title': content['title']},
        'summary': {'content': content['excerpt']},
        'content': {'content': content['content']},
        'enclosure': [{'href': content['lead_image_url'], 'type': 'image/jpeg'}],
        'alternate': [{'href': content['url'], 'type': 'text/html'}],
        'tags': [{'id': args.feedly_tag_id, 'label': args.feedly_tag_label}],
        'createdBy': {'application': 'Instapaper', 'userAgent': 'import_instapaper_to_feedly'},
    }
    auth_headers = {'Authorization': 'OAuth ' + args.feedly_token}
    response = requests.post('https://cloud.feedly.com/v3/entries/', headers=auth_headers, json=entry)
    while response.status_code == 429:  # respect rate limiting
        sleeptime = int(response.headers['X-RateLimit-Reset'])
        logging.warning('Hit Feedly rate limit: Sleeping %ss', sleeptime)
        time.sleep(sleeptime)
        response = requests.post('https://cloud.feedly.com/v3/entries/', headers=auth_headers, json=entry)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
def run_import(args):
    """Import the bookmarks from an Instapaper export into Feedly.

    Entries are processed oldest-first (the export lists newest first, so
    the row order is reversed). Rows can be restricted to a single
    Instapaper folder and/or a URL substring via *args*. When Mercury
    fails to parse an article, a bare reference-only entry is created
    instead. Ctrl-C stops the import; any other per-entry failure is
    logged and the import continues.

    Args:
        args: Parsed CLI arguments (file path, tokens, filters).
    """
    rows = list(read_instapaper_export(args.instapaper_file))
    # Reverse so the oldest bookmarks are imported first, preserving
    # chronological order in Feedly.
    for url, title, selection, folder in reversed(rows):
        if args.instapaper_folder and folder != args.instapaper_folder:
            continue
        if args.filter_string and args.filter_string not in url:
            continue

        try:
            content = parse_article_content(url, args)
        except Exception:
            # Best effort: fall back to a minimal entry built from the
            # export row alone.
            logging.warning('Unable to parse article content for %s, creating reference only', url)
            content = {
                'url': url,
                'title': title,
                'author': None,
                'excerpt': title or url,
                'content': title,
                'published': None,
                'lead_image_url': None,
            }

        try:
            create_feedly_entry(content, args)
        except KeyboardInterrupt:
            break
        except Exception as e:
            logging.error('Error importing %s: %s', url, e)
        else:
            logging.info('Imported %s', url)
|
|
|
|
|
if __name__ == '__main__':
    # Command-line interface: credentials for both services plus optional
    # filters restricting which export rows get imported.
    cli = argparse.ArgumentParser()
    cli.add_argument('--feedly-token', required=True, help='Feedly developer access token')
    cli.add_argument('--feedly-tag-id', required=True, help='ID of Feedly tag/board, e.g. user/xxx/tag/Instapaper')
    cli.add_argument('--feedly-tag-label', default='Instapaper', help='Label for Feedly tag/board')
    cli.add_argument('--mercury-token', required=True, help='API Key for Mercury Web Parser')
    cli.add_argument('--instapaper-folder', help='Import only those entries from the specified Instapaper folder')
    cli.add_argument('--filter-string', help='Import only those entries which contain the specified string in the URL')
    cli.add_argument('instapaper_file')
    options = cli.parse_args()

    # INFO level so each successful import is reported as it happens.
    logging.basicConfig(level=logging.INFO)
    run_import(options)