-
-
Save kspeeckaert/934a8308a4ec09e2e0756a313a915622 to your computer and use it in GitHub Desktop.
import json
import uuid
from datetime import datetime
from pathlib import Path
from urllib.parse import quote
from urllib.parse import urlparse

import html2text
import requests
from bs4 import BeautifulSoup
# URL of the article to clip.
source_url = 'http://tomaugspurger.github.io/modern-5-tidy.html'
# Instapaper text-view endpoint; "{url}" is filled in with the article URL.
IP_URL = 'http://www.instapaper.com/text?u={url}'
# Quiver notebook package (a "<UUID>.qvnotebook" folder) that receives the note.
QVR_NOTEBOOK = '/Users/kristof/Dropbox/Applications/Quiver/Quiver.qvlibrary/F54CCC03-A5EC-48E7-8DCD-A264ABCC4277.qvnotebook'
# Download the images and generate UUIDs
def localize_images(resource_path, img_tags):
    """Download each image into *resource_path* and repoint its tag at the copy.

    Args:
        resource_path: Directory the image files are written to.
        img_tags: Iterable of ``<img>`` tags (anything supporting ``.get('src')``
            and item assignment). Each successfully downloaded tag has its
            ``src`` rewritten in place to ``quiver-image-url/<name>``.

    Tags without a ``src`` and images that cannot be downloaded are skipped,
    leaving the tag untouched, so one bad image cannot abort the whole clip or
    store an HTTP error page as if it were image data.
    """
    for img_tag in img_tags:
        url = img_tag.get('src')
        if not url:
            continue
        try:
            r = requests.get(url)
            r.raise_for_status()
        except requests.RequestException as err:
            print('Skipping image {}: {}'.format(url, err))
            continue
        # Define the extension and the new filename
        img_ext = Path(urlparse(url).path).suffix
        img_name = '{}{}'.format(uuid.uuid4().hex.upper(), img_ext)
        img_filename = Path(resource_path, img_name)
        with open(str(img_filename), 'wb') as f:
            f.write(r.content)
        # Convert the original URL to a Quiver URL
        img_tag['src'] = 'quiver-image-url/{}'.format(img_name)
# Write content.json
def write_content(note_path, note_title, note_text):
    """Write the note's ``content.json`` into *note_path*.

    The file holds the note title and a single markdown cell containing
    *note_text*.

    Args:
        note_path: Directory of the note; must already exist.
        note_title: Title stored under the ``title`` key.
        note_text: Markdown stored as the data of the only cell.
    """
    qvr_content = {
        'title': note_title,
        'cells': [
            {'type': 'markdown',
             'data': note_text},
        ],
    }
    with open(str(Path(note_path, 'content.json')), 'w') as f:
        json.dump(qvr_content, f)
# Write meta.json
def write_meta(note_path, note_title, note_uuid):
    """Write the note's ``meta.json`` into *note_path*.

    ``created_at`` and ``updated_at`` are both set to the current time as
    whole seconds since the epoch.

    Args:
        note_path: Directory of the note; must already exist.
        note_title: Title stored under the ``title`` key.
        note_uuid: Identifier stored under the ``uuid`` key.
    """
    timestamp = int(datetime.now().timestamp())
    qvr_meta = {
        'title': note_title,
        'uuid': note_uuid,
        'created_at': timestamp,
        'updated_at': timestamp,
    }
    with open(str(Path(note_path, 'meta.json')), 'w') as f:
        json.dump(qvr_meta, f)
# Download the IP version of the URL
# Percent-encode the article URL so its own "?", "&" and "#" characters are
# passed through as part of the "u" parameter instead of being split off.
r = requests.get(IP_URL.format(url=quote(source_url, safe='')))
r.raise_for_status()
bs = BeautifulSoup(r.content, 'lxml')
qvr_note_uuid = str(uuid.uuid4()).upper()
# Create the folders
paths = {}
paths['notebook'] = QVR_NOTEBOOK
paths['note'] = Path(paths['notebook'], '{}.qvnote'.format(qvr_note_uuid))
paths['resources'] = Path(paths['note'], 'resources')
paths['resources'].mkdir(parents=True, exist_ok=True)
# Replace the original links by the quiver links
localize_images(paths['resources'], bs.find_all('img'))
# Remove title (only when the page actually has a title bar)
titlebars = bs.select('body main > div.titlebar')
if titlebars:
    titlebars[0].extract()
# Convert to Markdown
parser = html2text.HTML2Text()
parser.protect_links = True
parser.wrap_links = False
parser.body_width = 0
# Fall back to the whole document when there is no <main> element, rather
# than converting the literal string "None".
main_section = bs.find('main')
if main_section is None:
    main_section = bs
note_text = parser.handle(str(main_section))
note_title = bs.head.title.string
write_content(paths['note'],
              note_title,
              note_text)
write_meta(paths['note'],
           note_title,
           qvr_note_uuid)
OK, the "can't find notebook" problem was because I didn't put the notebook UUID in the notebook path. Duh. Password and URL solutions to follow.
To log in, replace the r = requests.get() line with the code below. Note: there is no error checking on the login:
# Instapaper login form endpoint and the form fields posted to it.
IP_LoginPage = 'https://www.instapaper.com/user/login'
values = {'username': 'Your UserName', 'password': 'Your Password'}
# Use one session for both calls so the second request reuses whatever
# cookies the login response set.
with requests.Session() as s:
    # ToDo: Error checking
    r = s.post(IP_LoginPage, data=values)  # login
    r = s.get(IP_URL.format(url=source_url))  # the request
And to grab the URL of the current tab in Firefox, use the following code to replace the line that sets source_url. Firefox isn't AppleScript-able, so this uses System Events to select the URL and copy it to the clipboard. It's not ideal but it works. It should work with Safari and Chrome too if you change the tell application line. Have fun.
from osascript import osascript
# AppleScript: bring Firefox to the front, press Cmd-L then Cmd-C, wait a
# second, and evaluate to the clipboard contents.
Script = """
tell application "Firefox" to activate
tell application "System Events"
keystroke "l" using command down
keystroke "c" using command down
delay 1
end tell
set FrontDocumentURL to the clipboard
"""
returncode, stdout, stderr = osascript(Script)
if returncode != 0:
    # Without a URL there is nothing to clip, so stop with the script's error.
    raise RuntimeError("AppleScript failed ({}): {}".format(returncode, stderr))
# Drop surrounding whitespace (e.g. a trailing newline) from the captured URL.
source_url = stdout.strip()
print("Clip from " + source_url)
So I added some stuff, changed some stuff, partly based on previous alterations I posted above plus some other stuff:
- Use notebook name instead of notebook path to specify where clipped notes go
- Create the notebook if it doesn't exist
- added login credentials for instapaper (you'll need an Instapaper account)
- Picks up url to clip from whatever page is current in Firefox. This probably isn't hard to alter for other browsers
- Satisfied my OCD by rearranging the code into a class.
The original aim was to turn this into a Firefox extension but running Python, while doable, is looking messy so for the moment it needs running from a terminal. Or you can slot it into an Alfred workflow or Quicksilver or similar, fire it using a hotkey.
You'll need Python3 and some library installs. See notes at head of script.
It's working very well for me but caveats:
- the script looks for a 'main' section in the html. Most pages have a main section, some don't. If it can't find one it clips the page's body, which ends up being a bit messy but it's editable so ...
- login details for Instapaper are held in the file as plain text. This is a long way short of ideal. Suggestions welcome.
- some error checking wouldn't hurt.
Any thoughts welcome. Enjoy.
#!/usr/local/bin/python3
# Needs python 3 See https://www.python.org/downloads/release/python-361/
#
# Required libraries:
# pip3 install html2text
# pip3 install requests
# pip3 install bs4
# pip3 install lxml
#
# ToDo:
# * Some error handling:
# * from osascript call
# * in self.Soup(), login, url post & get
# * check for errors when creating notes & notebooks
# * !!! we look for a 'main' section in self.Note(). this may not exist.
# we currently fall back to 'body' but this isn't ideal.
# Trouble is, 'main' is in no way mandatory in a page
# maybe we need a list of possible elements to try.
import sys, os, re
#import argparse, urllib, sys, os, re
import json
import uuid
from pathlib import Path
from urllib.parse import urlparse
from datetime import datetime
import time
import html2text
import requests
from bs4 import BeautifulSoup
from osascript import osascript
class QuiverSnip():
    """Clip the page open in Firefox into a Quiver notebook via Instapaper.

    Constructing an instance does most of the work: it locates (or creates)
    the target notebook, reads the URL from Firefox, downloads Instapaper's
    text view of that URL, creates the note folders, localises the images and
    converts the page to Markdown. ``Save()`` then writes ``content.json``
    and ``meta.json`` into the note folder.
    """

    def __init__(self):
        """Configure the clipper and prepare a note for the current page.

        Raises:
            RuntimeError: if the notebook cannot be found or created, or the
                AppleScript that reads the URL fails.
        """
        # Path of Quiver's base library package.
        self.QVR_LOCATION = '/Users/kimaldis/Documents/Quiver/Quiver.qvlibrary'
        # Name of the notebook we'll be clipping to.
        self.QVR_NOTEBOOK_NAME = "WebClips"
        # Path to the notebook package; filled in by FindNotebook().
        self.QVR_NOTEBOOK = ''
        # Instapaper endpoint for html2md.
        self.IP_URL = 'http://www.instapaper.com/text?u={url}'
        # Instapaper login.
        self.IP_LoginPage = 'https://www.instapaper.com/user/login'
        # SECURITY: credentials should not live in source code. Set
        # INSTAPAPER_USERNAME / INSTAPAPER_PASSWORD in the environment; the
        # literals below are kept only as a backward-compatible fallback.
        self.IP_LoginCredentials = {
            'username': os.environ.get('INSTAPAPER_USERNAME',
                                       'kim.aldis@gmail.com'),
            'password': os.environ.get('INSTAPAPER_PASSWORD', 'eggbert'),
        }

        if not self.FindNotebook():
            # Fail loudly instead of returning a half-initialised object
            # whose Save() would later die with an AttributeError.
            raise RuntimeError(
                "Couldn't find notebook " + self.QVR_NOTEBOOK_NAME)

        self.note_uuid = str(uuid.uuid4()).upper()
        self.GetFirefoxURL()
        self.Soup()
        self.CreateFolders()
        self.Note()

    def FindNotebook(self):
        """Point ``self.QVR_NOTEBOOK`` at the notebook named QVR_NOTEBOOK_NAME.

        Looks at every ``*.qvnotebook`` package directly inside
        ``self.QVR_LOCATION`` and compares the ``name`` in its ``meta.json``.
        If none matches, a new notebook package with a fresh UUID is created.

        Returns:
            True once ``self.QVR_NOTEBOOK`` holds the notebook path.
        """
        library = Path(self.QVR_LOCATION)
        if library.is_dir():
            for notebook_dir in sorted(library.glob('*.qvnotebook')):
                meta_file = notebook_dir / 'meta.json'
                if not meta_file.is_file():
                    continue
                try:
                    with open(str(meta_file), 'r') as f:
                        meta = json.load(f)
                except ValueError:
                    # Unparseable meta.json: cannot be the notebook we want.
                    continue
                if (isinstance(meta, dict)
                        and meta.get('name') == self.QVR_NOTEBOOK_NAME):
                    self.QVR_NOTEBOOK = str(notebook_dir)
                    return True

        # Couldn't find the notebook, so make it.
        print("Creating New Notebook " + self.QVR_NOTEBOOK_NAME)
        new_notebook_uuid = str(uuid.uuid4()).upper()
        new_notebook_path = library / (new_notebook_uuid + ".qvnotebook")
        metadata = {'name': self.QVR_NOTEBOOK_NAME, 'uuid': new_notebook_uuid}
        # Create the notebook package.
        os.makedirs(str(new_notebook_path))
        with open(str(new_notebook_path / 'meta.json'), 'w') as f:
            json.dump(metadata, f)
        self.QVR_NOTEBOOK = str(new_notebook_path)
        return True

    def GetFirefoxURL(self):
        """Store the URL copied from Firefox in ``self.source_url``.

        The AppleScript activates Firefox, sends Cmd-L and Cmd-C through
        System Events, and evaluates to the clipboard contents.

        Raises:
            RuntimeError: if the AppleScript exits with a non-zero code.
        """
        script = "\n".join([
            '',
            'tell application "Firefox" to activate',
            'tell application "System Events"',
            'keystroke "l" using command down',
            'keystroke "c" using command down',
            'delay 1',
            'end tell',
            'set FrontDocumentURL to the clipboard',
            '',
        ])
        returncode, stdout, stderr = osascript(script)
        if returncode != 0:
            raise RuntimeError(
                "AppleScript failed ({}): {}".format(returncode, stderr))
        # Strip whitespace so a trailing newline doesn't end up in the URL.
        self.source_url = stdout.strip()

    # Download the images and generate UUIDs
    def localize_images(self, resource_path, img_tags):
        """Download each image into *resource_path* and repoint its tag.

        Each successfully downloaded tag has its ``src`` rewritten in place
        to ``quiver-image-url/<name>``. Tags without a ``src`` and images
        that cannot be downloaded are skipped and left untouched.
        """
        for img_tag in img_tags:
            url = img_tag.get('src')
            if not url:
                continue
            try:
                r = requests.get(url)
                r.raise_for_status()
            except requests.RequestException as err:
                print('Skipping image {}: {}'.format(url, err))
                continue
            # Define the extension and the new filename
            img_ext = Path(urlparse(url).path).suffix
            img_name = '{}{}'.format(uuid.uuid4().hex.upper(), img_ext)
            img_filename = Path(resource_path, img_name)
            with open(str(img_filename), 'wb') as f:
                f.write(r.content)
            # Convert the original URL to a Quiver URL
            img_tag['src'] = 'quiver-image-url/{}'.format(img_name)

    # Write content.json
    def write_content(self, note_path, note_title):
        """Write ``content.json``: the title plus one markdown cell.

        The cell's data is ``self.note_text``.
        """
        qvr_content = {
            'title': note_title,
            'cells': [
                {'type': 'markdown',
                 'data': self.note_text},
            ],
        }
        with open(str(Path(note_path, 'content.json')), 'w') as f:
            json.dump(qvr_content, f)

    # Write meta.json to disk
    def write_meta(self, note_path, note_title, note_uuid):
        """Write ``meta.json`` with title, uuid and matching timestamps.

        ``created_at`` and ``updated_at`` are both the current time in whole
        seconds since the epoch.
        """
        timestamp = int(datetime.now().timestamp())
        qvr_meta = {
            'title': note_title,
            'uuid': note_uuid,
            'created_at': timestamp,
            'updated_at': timestamp,
        }
        with open(str(Path(note_path, 'meta.json')), 'w') as f:
            json.dump(qvr_meta, f)

    def Soup(self):
        """Log in to Instapaper, fetch the text view and parse it.

        Returns:
            The parsed document, also stored as ``self.bs``.

        Raises:
            requests.HTTPError: if the login or the fetch returns an HTTP
                error status.
        """
        from urllib.parse import quote

        # Download the IP version of the URL
        with requests.Session() as s:
            r = s.post(self.IP_LoginPage, data=self.IP_LoginCredentials)
            r.raise_for_status()
            # Percent-encode the page URL so its own "?", "&" and "#" are
            # passed through as part of the "u" parameter.
            r = s.get(self.IP_URL.format(url=quote(self.source_url, safe='')))
            r.raise_for_status()
        self.bs = BeautifulSoup(r.content, 'lxml')
        return self.bs

    def CreateFolders(self):
        """Create the note and resources folders and localise the images.

        Returns:
            Dict with ``notebook``, ``note`` and ``resources`` paths, also
            stored as ``self.paths``.
        """
        # Create the folders
        paths = {}
        paths['notebook'] = self.QVR_NOTEBOOK
        paths['note'] = Path(paths['notebook'],
                             '{}.qvnote'.format(self.note_uuid))
        paths['resources'] = Path(paths['note'], 'resources')
        paths['resources'].mkdir(parents=True, exist_ok=True)
        # Replace the original links by the quiver links
        self.localize_images(paths['resources'], self.bs.find_all('img'))
        self.paths = paths
        return paths

    def Note(self):
        """Convert the page to Markdown and store it in ``self.note_text``.

        Uses the ``main`` element, falling back to ``body`` and then to the
        whole document. The text is prefixed with a link to the source URL
        and the clipping date.
        """
        # Convert to Markdown
        parser = html2text.HTML2Text()
        parser.protect_links = True
        parser.wrap_links = False
        parser.body_width = 0

        # Test the lookup result itself instead of matching the string "None"
        # in the converted text, which misfires on content starting that way.
        section = self.bs.find('main')
        if section is None:
            print("couldn't find a main tag, trying 'body'. results may be messy")
            section = self.bs.find('body')
        if section is None:
            section = self.bs
        note_text = parser.handle(str(section))

        theDate = time.strftime("%d/%m/%Y at %H:%M:%S")
        note_text = ("Clipped From [Here](" + self.source_url + ")\n"
                     + " on " + theDate + "\n" + note_text)
        self.note_text = note_text

    def _note_title(self):
        """Return the page title text, or the source URL when it has none."""
        head = self.bs.head
        title = head.title if head is not None else None
        if title is not None and title.string:
            return title.string
        return self.source_url

    def Save(self):
        """Write ``content.json`` and ``meta.json`` into the note folder."""
        note_title = self._note_title()
        self.write_content(self.paths['note'], note_title)
        self.write_meta(self.paths['note'], note_title, self.note_uuid)
# Build the note for the page currently open in Firefox, then write it out.
QS = QuiverSnip()
QS.Save()
# format() instead of "+" so a title whose string is None cannot raise
# TypeError after the note has already been saved.
print("New Note title: {}".format(QS.bs.head.title.string))
So I added some stuff, changed some stuff, partly based on previous alterations I posted above plus some other stuff:
- Use notebook name instead of notebook path to specify where clipped notes go
- Create the notebook if it doesn't exist
- added login credentials for instapaper (you'll need an Instapaper account)
- Picks up url to clip from whatever page is current in Firefox. This probably isn't hard to alter for other browsers
- Satisfied my OCD by rearranging the code into a class.
The original aim was to turn this into a Firefox extension but running Python, while doable, is looking messy so for the moment it needs running from a terminal. Or you can slot it into an Alfred workflow or Quicksilver or similar, fire it using a hotkey.
You'll need Python3 and some library installs. See notes at head of script.
It's working very well for me but caveats:
- the script looks for a 'main' section in the html. Most pages have a main section, some don't. If it can't find one it clips the page's body, which ends up being a bit messy but it's editable so ...
- login details for Instapaper are held in the file as plain text. This is a long way short of ideal. Suggestions welcome.
- some error checking wouldn't hurt.
Any thoughts welcome. Enjoy.
is this still working for you? It doesn't appear to login to Instapaper for ...
Noodling around with this script, I've found it failing because Instapaper requires you to be logged in. I'm also finding that Quiver can't see the newly created notebook and notes after it's run, even after restarting Quiver.
I'll play a bit more, see if I can't get it to work because it's an idea worth pursuing. The plan is to have it grab the browser URL, probably also have it triggered by a hotkey somewhere.
It needs python3, html2text, requests, bs4 and lxml for it to work.
KYP