Export Instapaper to HTML

Use this to automatically scrape all of your saved Instapaper articles and store them locally as HTML files.

You will need to have the following packages installed (for example with pip install requests beautifulsoup4):

  • requests
  • beautifulsoup4 (bs4)

Configure your username and password, then run the script. It will go through all articles shown on your home page and download the copy Instapaper has stored as HTML files into a folder called output. Any errors will be reported and logged to failed.txt.
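
If you prefer not to hard-code credentials in the script, a minimal variation (just a sketch, not part of the original gist; the INSTAPAPER_USERNAME and INSTAPAPER_PASSWORD variable names are placeholders) reads them from environment variables before logging in:

```python
import os
import requests

# Placeholder environment variable names; set them however you prefer.
username = os.environ["INSTAPAPER_USERNAME"]
password = os.environ["INSTAPAPER_PASSWORD"]

s = requests.Session()
s.post("https://www.instapaper.com/user/login", data={
    "username": username,
    "password": password,
    "keep_logged_in": "yes"
})
```

The rest of the script can then use the session s exactly as below.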

Forked and modified from: jaflo. Thanks for sharing the original gist.

Modifications to the original gist:

  • Deleted the PDF conversion
  • Link to the original page is included in the article
  • Filenames are truncated to 200 characters
#!/usr/bin/env python
# coding: utf-8
import os
import time

import requests
from bs4 import BeautifulSoup

# Log in once and reuse the authenticated session for all later requests.
s = requests.Session()
s.post("https://www.instapaper.com/user/login", data={
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
    "keep_logged_in": "yes"
})

base = "./output/"
os.makedirs(base, exist_ok=True)  # create the output folder if it does not exist yet


def get_ids(page=1):
    r = s.get("https://www.instapaper.com/u/" + str(page))
    print(r.url)
    soup = BeautifulSoup(r.text, "html.parser")
    articles = soup.find(id="article_list").find_all("article")
    ids = [i["id"].replace("article_", "") for i in articles]
    has_more = soup.find(class_="paginate_older") is not None
    return ids, has_more


def get_article(id):
    r = s.get("https://www.instapaper.com/read/" + str(id))
    soup = BeautifulSoup(r.text, "html.parser")
    title = soup.find(id="titlebar").find("h1").getText()
    origin = soup.find(id="titlebar").find(class_="origin_line")  # link back to the original page
    content = soup.find(id="story").decode_contents()
    return {
        "title": title.strip(),
        "origin": origin,
        "content": content.strip()
    }


# Truncate the file name so that name plus extension stays within max_length characters
def truncate_filename(name, extension, max_length=200):
    total_length = len(name) + len(extension)
    if total_length > max_length:
        name = name[:max_length - len(extension)]  # drop the excess characters
    return name + extension


def download_article(id):
    article = get_article(id)
    file_name = id + " " + article["title"]
    # Keep only letters, digits and spaces so the title is safe to use as a file name.
    file_name = "".join([c for c in file_name if c.isalpha()
                         or c.isdigit() or c == " "]).rstrip()
    file_name = base + truncate_filename(file_name, ".html")
    print(file_name)
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n")
        file.write("<title>%s</title>" % (article["title"]))
        file.write("</head>\n<body>")
        file.write("<h1>%s</h1>" % (article["title"]))
        file.write("<div id='origin'>%s · %s</div>" % (article["origin"], id))
        file.write(article["content"])
        file.write("</body>\n</html>")
    return file_name


has_more = True
page = 1
failure_log = open("failed.txt", "a+")

while has_more:
    print("Page " + str(page))
    ids, has_more = get_ids(page)
    for id in ids:
        print(" " + id + ": ", end="")
        start = time.time()
        try:
            file_name = download_article(id)
        except Exception as e:
            # Log the failure and keep going with the next article.
            print("failed!")
            print(e)
            failure_log.write("%s\t%s\n" % (id, str(e)))
            failure_log.flush()
        else:
            duration = time.time() - start
            print(str(round(duration, 2)) + " seconds")
    page += 1
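
If some downloads fail, failed.txt ends up with one tab-separated line per article (the article id followed by the error message). A small follow-up sketch, assuming it is appended to the end of the script above (the retried set and the variable names are additions, not part of the original gist), rereads those ids and tries them again:

```python
# Retry the articles recorded in failed.txt (id<TAB>error per line).
# Optional addition: paste after the main loop so download_article is available.
retried = set()
with open("failed.txt") as log:
    for line in log:
        article_id = line.split("\t", 1)[0].strip()
        if not article_id or article_id in retried:
            continue
        retried.add(article_id)
        try:
            download_article(article_id)
            print(article_id + ": retried OK")
        except Exception as e:
            print(article_id + ": still failing (" + str(e) + ")")
```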