FeiSun/description.md

## description.md

      
    Raw
  

              description.md
            
          
    Use this to automatically scrape all of your saved Instapaper articles locally as HTML and PDF files. I originally wrote this to read my saved documents on my reMarkable tablet. Instapaper does not have an option to export all my stuff as PDF as far as I could tell (the built-in options only export a subset).
You will need to have the following packages installed:

requests
Beautiful Soup
pdfkit

Configure your username and password, then run the script. It goes through all articles shown on your home page, downloads the copy Instapaper has stored into a folder called output as an HTML file, and converts it into a PDF. You can customize the look by updating the included styles.css file. Any errors are reported and logged to failed.txt. Errors might be due to parsing errors on Instapaper's side or to PDF conversion issues. I am not sure how to fix the former, but the script retries a couple of times in the latter case.
On my computer this took about 1-10 seconds per article and up to a minute when PDFs needed to be remade. You might need to quit and rerun if it takes longer than that. It should resume from where it left off if it detects a PDF has already been created. This was written in an afternoon, apologies for any issues.

  
## scrape.py
#!/usr/bin/env python
# coding: utf-8

import requests
import time
from bs4 import BeautifulSoup
import pdfkit
import os

# One session is shared by every request below so that whatever login state
# the POST establishes is reused by the later page and article fetches.
s = requests.Session()
s.post("https://www.instapaper.com/user/login", data={
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
    "keep_logged_in": "yes"
})

# Directory that receives the downloaded HTML files and the generated PDFs.
base = "./output/"
# Create it up front: article_converted() lists this directory before the
# first download, which raises FileNotFoundError if it does not exist yet.
os.makedirs(base, exist_ok=True)


def get_ids(page=1):
    """Fetch one page of the saved-article list.

    Returns a tuple ``(ids, has_more)``: the ids of the ``<article>``
    elements found inside ``#article_list`` (with the ``article_`` prefix
    removed), and whether the page contains a ``paginate_older`` element.
    """
    listing_url = "https://www.instapaper.com/u/" + str(page)
    response = s.get(listing_url)
    document = BeautifulSoup(response.text, "html.parser")

    ids = []
    for node in document.find(id="article_list").find_all("article"):
        ids.append(node["id"].replace("article_", ""))

    older_link = document.find(class_="paginate_older")
    return ids, older_link is not None


def get_article(id):
    """Fetch the stored copy of article ``id`` and pull out its parts.

    Returns a dict with the whitespace-stripped ``title`` (text of the
    ``<h1>`` in ``#titlebar``), ``origin`` (text of the ``origin_line``
    element in ``#titlebar``) and ``content`` (inner HTML of ``#story``).
    A missing element surfaces as an ``AttributeError`` from the lookup.
    """
    page = s.get("https://www.instapaper.com/read/" + str(id))
    document = BeautifulSoup(page.text, "html.parser")

    title_bar = document.find(id="titlebar")
    heading = title_bar.find("h1").getText()
    source_line = title_bar.find(class_="origin_line").getText()
    body_html = document.find(id="story").decode_contents()

    return dict(
        title=heading.strip(),
        origin=source_line.strip(),
        content=body_html.strip(),
    )


def article_converted(id):
    """Return the path of the PDF already generated for article ``id``.

    Generated files are named ``"<id> <title>.pdf"`` (or ``"<id>.pdf"`` when
    the sanitized title is empty), so the id has to be matched as a whole
    leading token. A bare ``startswith(id)`` would also accept the PDF of a
    different article whose id merely begins with the same characters
    (``123`` matching ``1234 Title.pdf``) and that article would then be
    skipped as if it had already been converted.

    Returns ``None`` when no matching PDF exists in ``base``.
    """
    id = str(id)
    for file_name in os.listdir(base):
        if not file_name.endswith(".pdf"):
            continue
        if file_name == id + ".pdf" or file_name.startswith(id + " "):
            # os.listdir() yields bare names, so joining with base is enough.
            return base + file_name
    return None


def download_article(id):
    """Download article ``id`` and save it as an HTML file under ``base``.

    The file is named ``"<id> <title>.html"`` with every character other
    than letters, digits and spaces removed. It contains the title as an
    ``<h1>``, an ``#origin`` line with the source and the id, and then the
    article body.

    Returns the path of the written HTML file. Exceptions raised while
    fetching or parsing the article propagate to the caller.
    """
    from html import escape  # local import: only this function needs it

    article = get_article(id)

    # Drop everything but letters, digits and spaces so the title cannot
    # introduce path separators; rstrip() removes the trailing space that
    # is left over when the title is empty.
    file_name = id + " " + article["title"]
    file_name = "".join([c for c in file_name if c.isalpha()
                         or c.isdigit() or c == " "]).rstrip()
    file_name = base + file_name + ".html"

    # Write UTF-8 explicitly: the markup contains non-ASCII text (the "·"
    # separator at least) and the PDF conversion is configured for UTF-8,
    # whereas the platform default encoding may be something else.
    with open(file_name, "w", encoding="utf-8") as file:
        # Title and origin are plain text taken from the page, so escape
        # them; the article content is already HTML and is written as-is.
        file.write("<h1>%s</h1>" % escape(article["title"], quote=False))
        file.write("<div id='origin'>%s · %s</div>"
                   % (escape(article["origin"], quote=False), id))
        file.write(article["content"])

    return file_name


def convert_to_pdf(file_name):
    """Render the HTML file ``file_name`` to a PDF and return the PDF path.

    The output path is ``file_name`` with its last five characters (the
    ``.html`` suffix) replaced by ``.pdf``. Pages are Letter-sized with a
    0.75in margin on every side and styled by ``styles.css``.
    """
    pdf_path = file_name[:-5] + ".pdf"

    # Built up in the same key order as a single literal would have.
    rendering = {"page-size": "Letter"}
    rendering.update(dict.fromkeys(
        ("margin-top", "margin-right", "margin-bottom", "margin-left"),
        "0.75in",
    ))
    rendering.update({
        "encoding": "UTF-8",
        "no-outline": None,
        "user-style-sheet": "styles.css",
        "load-error-handling": "ignore",
        "quiet": "",
    })

    pdfkit.from_file(file_name, pdf_path, options=rendering)
    return pdf_path


# Crawl every page of the article list, converting each article that does
# not have a PDF yet. Failures are printed and appended to failed.txt.
has_more = True
page = 1

# The context manager guarantees the log is flushed and closed even when the
# crawl is interrupted or aborts with an unexpected exception.
with open("failed.txt", "a+") as failure_log:
    while has_more:
        print("Page " + str(page))
        ids, has_more = get_ids(page)
        for article_id in ids:
            print("  " + article_id + ": ", end="")
            if article_converted(article_id):
                # A PDF from an earlier run exists; this is what makes the
                # script resumable.
                print("exists")
                continue

            start = time.time()
            try:
                file_name = download_article(article_id)
            except Exception as e:
                # Download/parse errors are not retried: log and move on.
                print("failed!")
                print(e)
                failure_log.write("%s\t%s\n" % (article_id, str(e)))
                failure_log.flush()
                continue

            # PDF conversion is retried: up to 10 retries after the first
            # failed attempt (11 attempts in total) before giving up.
            retries = 10
            while True:
                try:
                    convert_to_pdf(file_name)
                except Exception as e:
                    retries -= 1
                    if retries < 0:
                        print("failed!")
                        print(e)
                        failure_log.write("%s\t%s\n" % (article_id, str(e)))
                        failure_log.flush()
                        break
                    continue
                break

            duration = time.time() - start
            print(str(round(duration, 2)) + " seconds")
            if duration < 1:  # wait a second
                # Throttle to at most one processed article per second.
                time.sleep(1 - duration)
        page += 1

## styles.css
/* User style sheet applied to every downloaded article when it is rendered
   to PDF (passed to the converter as "user-style-sheet"). */

/* Remove the default margin and padding around the page. */
html,
body {
	margin: 0;
	padding: 0;
}

/* Base typography: serif text at 20px, hyphenation disabled, long words
   allowed to wrap, black on white. */
html {
	font-family: Georgia, "Times New Roman", Times, serif;
	font-size: 20px;
	line-height: 1.5;
	word-wrap: break-word;
	-webkit-hyphens: none;
	-ms-hyphens: none;
	hyphens: none;
	padding: 1em;
	background: white;
	color: black;
}

/* The source/id line that the scraper writes below the title
   (<div id='origin'>): dimmed and separated from the article body. */
#origin {
	opacity: 0.6;
	margin-bottom: 2em;
}

/* Links keep the surrounding text color and are underlined with a border
   instead of text-decoration. */
a {
	text-decoration: none;
	border-bottom: 2px solid;
	color: inherit;
}

/* Append the link target in parentheses after every link. The white
   border is presumably meant to mask the link underline beneath the
   appended URL - confirm in rendered output. */
a:after {
	content: " (" attr(href) ")";
	border-bottom: 2px solid white;
	color: rgba(0, 0, 0, 0.6);
}

/* Images never exceed the width of their container. */
img {
	max-width: 100%;
}

/* Code is set smaller and tighter than body text, in a monospace face. */
pre,
code {
	font-size: 16px;
	line-height: 1.1em;
	font-family: "Monaco", monospace;
}

/* Drop the margin of a paragraph that is the first child of a list item. */
li p:first-child {
	margin: 0;
}
	#!/usr/bin/env python
	# coding: utf-8

	import requests
	import time
	from bs4 import BeautifulSoup
	import pdfkit
	import os

	# One session is shared by every request below so that whatever login
	# state the POST establishes is reused by the later fetches.
	s = requests.Session()
	s.post("https://www.instapaper.com/user/login", data={
	    "username": "YOUR_USERNAME",
	    "password": "YOUR_PASSWORD",
	    "keep_logged_in": "yes"
	})

	# Directory that receives the downloaded HTML files and generated PDFs.
	base = "./output/"
	# Create it up front: article_converted() lists this directory before
	# the first download, which raises FileNotFoundError if it is missing.
	os.makedirs(base, exist_ok=True)


	def get_ids(page=1):
	    """Fetch one page of the saved-article list.

	    Returns a tuple ``(ids, has_more)``: the ids of the ``<article>``
	    elements found inside ``#article_list`` (with the ``article_``
	    prefix removed), and whether the page contains a
	    ``paginate_older`` element.
	    """
	    r = s.get("https://www.instapaper.com/u/" + str(page))
	    soup = BeautifulSoup(r.text, "html.parser")

	    articles = soup.find(id="article_list").find_all("article")
	    ids = [i["id"].replace("article_", "") for i in articles]
	    has_more = soup.find(class_="paginate_older") is not None
	    return ids, has_more


	def get_article(id):
	    """Fetch the stored copy of article ``id`` and pull out its parts.

	    Returns a dict with the whitespace-stripped ``title`` (text of the
	    ``<h1>`` in ``#titlebar``), ``origin`` (text of the ``origin_line``
	    element in ``#titlebar``) and ``content`` (inner HTML of
	    ``#story``).
	    """
	    r = s.get("https://www.instapaper.com/read/" + str(id))
	    soup = BeautifulSoup(r.text, "html.parser")

	    title = soup.find(id="titlebar").find("h1").getText()
	    origin = soup.find(id="titlebar").find(class_="origin_line").getText()
	    content = soup.find(id="story").decode_contents()
	    return {
	        "title": title.strip(),
	        "origin": origin.strip(),
	        "content": content.strip()
	    }


	def article_converted(id):
	    """Return the path of the PDF already generated for article ``id``.

	    Generated files are named ``"<id> <title>.pdf"`` (or ``"<id>.pdf"``
	    when the sanitized title is empty), so the id has to be matched as
	    a whole leading token. A bare ``startswith(id)`` would also accept
	    the PDF of a different article whose id merely begins with the same
	    characters (``123`` matching ``1234 Title.pdf``).

	    Returns ``None`` when no matching PDF exists in ``base``.
	    """
	    id = str(id)
	    for file_name in os.listdir(base):
	        if not file_name.endswith(".pdf"):
	            continue
	        if file_name == id + ".pdf" or file_name.startswith(id + " "):
	            # os.listdir() yields bare names; joining with base suffices.
	            return base + file_name
	    return None


	def download_article(id):
	    """Download article ``id`` and save it as an HTML file under ``base``.

	    The file is named ``"<id> <title>.html"`` with every character other
	    than letters, digits and spaces removed. It contains the title as an
	    ``<h1>``, an ``#origin`` line with the source and the id, and then
	    the article body.

	    Returns the path of the written HTML file.
	    """
	    from html import escape  # local import: only this function needs it

	    article = get_article(id)

	    # Drop everything but letters, digits and spaces so the title cannot
	    # introduce path separators; rstrip() removes the trailing space
	    # that is left over when the title is empty.
	    file_name = id + " " + article["title"]
	    file_name = "".join([c for c in file_name if c.isalpha()
	                         or c.isdigit() or c == " "]).rstrip()
	    file_name = base + file_name + ".html"

	    # Write UTF-8 explicitly: the markup contains non-ASCII text and the
	    # PDF conversion is configured for UTF-8, whereas the platform
	    # default encoding may be something else.
	    with open(file_name, "w", encoding="utf-8") as file:
	        # Title and origin are plain text, so escape them; the article
	        # content is already HTML and is written as-is.
	        file.write("<h1>%s</h1>" % escape(article["title"], quote=False))
	        file.write("<div id='origin'>%s · %s</div>"
	                   % (escape(article["origin"], quote=False), id))
	        file.write(article["content"])

	    return file_name


	def convert_to_pdf(file_name):
	    """Render the HTML file ``file_name`` to a PDF and return its path.

	    The output path is ``file_name`` with its last five characters (the
	    ``.html`` suffix) replaced by ``.pdf``. Pages are Letter-sized with
	    a 0.75in margin on every side and styled by ``styles.css``.
	    """
	    new_name = file_name[:-5] + ".pdf"
	    margin = "0.75in"
	    options = {
	        "page-size": "Letter",
	        "margin-top": margin,
	        "margin-right": margin,
	        "margin-bottom": margin,
	        "margin-left": margin,
	        "encoding": "UTF-8",
	        "no-outline": None,
	        "user-style-sheet": "styles.css",
	        "load-error-handling": "ignore",
	        "quiet": "",
	    }

	    pdfkit.from_file(file_name, new_name, options=options)
	    return new_name


	# Crawl every page of the article list, converting each article that
	# does not have a PDF yet. Failures are printed and appended to
	# failed.txt.
	has_more = True
	page = 1

	# The context manager guarantees the log is flushed and closed even when
	# the crawl is interrupted or aborts with an unexpected exception.
	with open("failed.txt", "a+") as failure_log:
	    while has_more:
	        print("Page " + str(page))
	        ids, has_more = get_ids(page)
	        for article_id in ids:
	            print(" " + article_id + ": ", end="")
	            if article_converted(article_id):
	                # A PDF from an earlier run exists; skip the article.
	                print("exists")
	                continue

	            start = time.time()
	            try:
	                file_name = download_article(article_id)
	            except Exception as e:
	                # Download/parse errors are not retried: log and move on.
	                print("failed!")
	                print(e)
	                failure_log.write("%s\t%s\n" % (article_id, str(e)))
	                failure_log.flush()
	                continue

	            # PDF conversion is retried: up to 10 retries after the first
	            # failed attempt (11 attempts in total) before giving up.
	            retries = 10
	            while True:
	                try:
	                    convert_to_pdf(file_name)
	                except Exception as e:
	                    retries -= 1
	                    if retries < 0:
	                        print("failed!")
	                        print(e)
	                        failure_log.write(
	                            "%s\t%s\n" % (article_id, str(e)))
	                        failure_log.flush()
	                        break
	                    continue
	                break

	            duration = time.time() - start
	            print(str(round(duration, 2)) + " seconds")
	            if duration < 1:  # wait a second
	                # Throttle to at most one processed article per second.
	                time.sleep(1 - duration)
	        page += 1
	/* User style sheet applied to every downloaded article when it is
	   rendered to PDF (passed to the converter as "user-style-sheet"). */

	/* Remove the default margin and padding around the page. */
	html,
	body {
	margin: 0;
	padding: 0;
	}

	/* Base typography: serif text at 20px, hyphenation disabled, long
	   words allowed to wrap, black on white. */
	html {
	font-family: Georgia, "Times New Roman", Times, serif;
	font-size: 20px;
	line-height: 1.5;
	word-wrap: break-word;
	-webkit-hyphens: none;
	-ms-hyphens: none;
	hyphens: none;
	padding: 1em;
	background: white;
	color: black;
	}

	/* The source/id line that the scraper writes below the title
	   (<div id='origin'>): dimmed and separated from the article body. */
	#origin {
	opacity: 0.6;
	margin-bottom: 2em;
	}

	/* Links keep the surrounding text color and are underlined with a
	   border instead of text-decoration. */
	a {
	text-decoration: none;
	border-bottom: 2px solid;
	color: inherit;
	}

	/* Append the link target in parentheses after every link. The white
	   border is presumably meant to mask the link underline beneath the
	   appended URL - confirm in rendered output. */
	a:after {
	content: " (" attr(href) ")";
	border-bottom: 2px solid white;
	color: rgba(0, 0, 0, 0.6);
	}

	/* Images never exceed the width of their container. */
	img {
	max-width: 100%;
	}

	/* Code is set smaller and tighter than body text, in a monospace face. */
	pre,
	code {
	font-size: 16px;
	line-height: 1.1em;
	font-family: "Monaco", monospace;
	}

	/* Drop the margin of a paragraph that is the first child of a list item. */
	li p:first-child {
	margin: 0;
	}