|
import time |
|
import csv |
|
import dateutil.parser |
|
import requests |
|
import logging |
|
import argparse |
|
|
|
|
|
def read_instapaper_export(filename):
    """Yield bookmark rows from an Instapaper CSV export.

    Each yielded row is a list of strings in the export's column order
    (URL, Title, Selection, Folder). The header row (and any row whose
    first column is not an http(s) URL) is skipped.

    Args:
        filename: Path to the CSV file exported from Instapaper.

    Yields:
        list[str]: one row per bookmark.
    """
    # newline='' is required by the csv module so that newlines embedded
    # inside quoted fields are parsed correctly.
    with open(filename, newline='') as f:
        for row in csv.reader(f):
            # Guard against blank lines (empty rows), which would otherwise
            # raise IndexError on row[0].
            if row and row[0].startswith('http'):
                yield row
|
|
|
|
|
def parse_article_content(url, args):
    """Fetch parsed article content for *url* from the Mercury Web Parser.

    Args:
        url: Article URL to parse.
        args: Parsed CLI arguments; ``args.mercury_token`` is the Mercury
            API key.

    Returns:
        dict: Mercury's response, with ``url`` forced back to the original
        request URL and ``published`` set to an int Unix timestamp, or
        ``None`` when no parseable publication date is present.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
        Exception: when Mercury reports a parse failure (``failed`` flag).
    """
    params = {'url': url}
    headers = {'x-api-key': args.mercury_token}
    response = requests.get('https://mercury.postlight.com/parser', params=params, headers=headers)
    response.raise_for_status()
    content = response.json()
    if 'title' not in content and content.get('failed'):
        raise Exception(content['messages'])
    content['url'] = url  # force url back to original
    try:
        # datetime.timestamp() is portable; the previous strftime('%s') is a
        # glibc-only extension that is undocumented in Python, fails on
        # Windows, and silently ignores timezone info on aware datetimes.
        content['published'] = int(dateutil.parser.parse(content['date_published']).timestamp())
    except (KeyError, TypeError, ValueError):
        # Missing, None, or unparseable date_published — no timestamp.
        content['published'] = None
    return content
|
|
|
|
|
def create_feedly_entry(content, args):
    """POST a tagged entry to Feedly built from parsed article *content*.

    Retries for as long as Feedly answers 429, sleeping for the number of
    seconds the server reports in its X-RateLimit-Reset header.

    Args:
        content: dict with the article fields produced by
            parse_article_content (url, title, author, excerpt, content,
            published, lead_image_url).
        args: Parsed CLI arguments; supplies the Feedly token and tag.

    Returns:
        The decoded JSON body of the successful Feedly response.

    Raises:
        requests.HTTPError: on any non-429 error status.
    """
    # Feedly wants millisecond timestamps; propagate None when unknown.
    millis = content['published'] * 1000 if content['published'] else None
    entry = {
        'title': content['title'],
        'author': content['author'],
        'published': millis,
        'crawled': millis,
        'updated': millis,
        'origin': {'htmlUrl': content['url'], 'title': content['title']},
        'summary': {'content': content['excerpt']},
        'content': {'content': content['content']},
        'enclosure': [{'href': content['lead_image_url'], 'type': 'image/jpeg'}],
        'alternate': [{'href': content['url'], 'type': 'text/html'}],
        'tags': [{'id': args.feedly_tag_id, 'label': args.feedly_tag_label}],
        'createdBy': {'application': 'Instapaper', 'userAgent': 'import_instapaper_to_feedly'},
    }
    auth_headers = {'Authorization': 'OAuth ' + args.feedly_token}
    response = requests.post('https://cloud.feedly.com/v3/entries/', headers=auth_headers, json=entry)
    while response.status_code == 429:  # respect rate limiting
        sleeptime = int(response.headers['X-RateLimit-Reset'])
        logging.warning('Hit Feedly rate limit: Sleeping %ss', sleeptime)
        time.sleep(sleeptime)
        response = requests.post('https://cloud.feedly.com/v3/entries/', headers=auth_headers, json=entry)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
def run_import(args):
    """Import the bookmarks from an Instapaper export into Feedly.

    Entries are processed oldest-first (the export lists newest first, so
    the row order is reversed). Rows can be restricted to a single
    Instapaper folder and/or a URL substring via *args*. When Mercury
    fails to parse an article, a bare reference-only entry is created
    instead. Ctrl-C stops the import; any other per-entry failure is
    logged and the import continues.

    Args:
        args: Parsed CLI arguments (file path, tokens, filters).
    """
    rows = list(read_instapaper_export(args.instapaper_file))
    # Reverse so the oldest bookmarks are imported first, preserving
    # chronological order in Feedly.
    for url, title, selection, folder in reversed(rows):
        if args.instapaper_folder and folder != args.instapaper_folder:
            continue
        if args.filter_string and args.filter_string not in url:
            continue

        try:
            content = parse_article_content(url, args)
        except Exception:
            # Best effort: fall back to a minimal entry built from the
            # export row alone.
            logging.warning('Unable to parse article content for %s, creating reference only', url)
            content = {
                'url': url,
                'title': title,
                'author': None,
                'excerpt': title or url,
                'content': title,
                'published': None,
                'lead_image_url': None,
            }

        try:
            create_feedly_entry(content, args)
        except KeyboardInterrupt:
            break
        except Exception as e:
            logging.error('Error importing %s: %s', url, e)
        else:
            logging.info('Imported %s', url)
|
|
|
|
|
if __name__ == '__main__':
    # Command-line interface: credentials for both services plus optional
    # filters restricting which export rows get imported.
    cli = argparse.ArgumentParser()
    cli.add_argument('--feedly-token', required=True, help='Feedly developer access token')
    cli.add_argument('--feedly-tag-id', required=True, help='ID of Feedly tag/board, e.g. user/xxx/tag/Instapaper')
    cli.add_argument('--feedly-tag-label', default='Instapaper', help='Label for Feedly tag/board')
    cli.add_argument('--mercury-token', required=True, help='API Key for Mercury Web Parser')
    cli.add_argument('--instapaper-folder', help='Import only those entries from the specified Instapaper folder')
    cli.add_argument('--filter-string', help='Import only those entries which contain the specified string in the URL')
    cli.add_argument('instapaper_file')
    options = cli.parse_args()

    # INFO level so each successful import is reported as it happens.
    logging.basicConfig(level=logging.INFO)
    run_import(options)