- start the server locally
- log in to your Penzu account
- run the export script as a snippet in Chrome (DevTools → Sources → Snippets)
The script fetches all your entries using your current session and sends them to the local server,
which saves everything in the `export` folder.
export | |
.env |
from flask import Flask, make_response, request | |
import os | |
import re,datetime | |
# Folder named "export" next to this script; save() writes every entry into it.
BASE_FOLDER = os.path.join(os.path.split(__file__)[0], 'export')

# Flask application object on which the /save route is registered.
app = Flask(import_name=__name__)
def isostr_to_date(s):
    """Parse an ISO 8601 timestamp string into a naive ``datetime``.

    Accepts the form produced by JavaScript's ``Date.prototype.toISOString``
    (``2014-03-05T12:34:56.789Z``) as well as shorter forms, down to a bare
    ``YYYY-MM-DD`` date. Anything after the seconds/fraction (such as a
    trailing ``Z`` or a UTC offset) is ignored, so the result has no tzinfo.

    Args:
        s: the timestamp text.

    Returns:
        A ``datetime.datetime`` built from the fields found in ``s``; time
        fields that are absent default to zero.

    Raises:
        ValueError: if ``s`` does not start with a ``YYYY-MM-DD`` date, or a
            field is out of range (e.g. month 13).
    """
    match = re.match(
        r'\s*(\d{4})-(\d{1,2})-(\d{1,2})'
        r'(?:[T ](\d{1,2}):(\d{1,2})(?::(\d{1,2})(?:\.(\d+))?)?)?',
        s,
    )
    if match is None:
        raise ValueError('not an ISO 8601 timestamp: %r' % (s,))

    year, month, day, hour, minute, second, fraction = match.groups()
    # The fraction is a fraction of a second, not a microsecond count:
    # pad/truncate it to six digits so that ".789" becomes 789000 us.
    microsecond = int((fraction or '0')[:6].ljust(6, '0'))
    return datetime.datetime(
        int(year), int(month), int(day),
        int(hour or 0), int(minute or 0), int(second or 0),
        microsecond,
    )
@app.route("/save", methods=['GET', 'POST', 'OPTIONS'])
def save():
    """Receive one journal entry from the browser snippet and write it to disk.

    POST form fields read here: ``title``, ``body``, ``date`` (ISO timestamp,
    optional) and ``ftype`` (optional, defaults to ``html``). Entries whose
    ``ftype`` is ``html`` are passed through ``convert_to_html`` and stored
    with a ``.txt`` extension. The file is written into ``BASE_FOLDER`` and
    starts with the original title followed by a blank line.

    OPTIONS requests only receive the Access-Control headers (CORS
    preflight); any other method gets a plain usage message.

    Returns:
        A 'success' response carrying the Access-Control headers for POST and
        OPTIONS, or the usage string for other methods.
    """
    response = make_response('success')
    if request.method == 'POST':
        data = request.form
        # ftype becomes part of the file name, so keep only word characters;
        # otherwise a value like "../../x" could escape BASE_FOLDER.
        ftype = re.sub(r'\W+', '', data.get('ftype') or '') or 'html'
        # Missing fields arrive as None; fall back to empty text instead of
        # crashing in re.sub()/encode() further down.
        heading = data.get('title') or u''
        body = data.get('body') or u''
        title = heading or 'untitled'
        sdate = data.get('date')
        if sdate:
            d = isostr_to_date(sdate)
            sdate = d.strftime('%Y-%m-%d')
            title = '%s-%s' % (sdate, title)
        if ftype == 'html':
            body = convert_to_html(body)
            ftype = 'txt'
        # Collapse whitespace, punctuation and both kinds of path separator
        # into underscores so the title is a safe single file name.
        title = re.sub(r"[\s,\-./?'\\]+", '_', title)
        name = '%s.%s' % (title, ftype)
        # The export folder is not created anywhere else; make sure it exists.
        if not os.path.isdir(BASE_FOLDER):
            os.makedirs(BASE_FOLDER)
        path = os.path.join(BASE_FOLDER, name)
        with open(path, 'wb') as f:
            # Concatenate as text first, then encode: bytes + str would fail
            # on Python 3.
            f.write((heading + u'\n\n').encode('utf8'))
            f.write(body.encode('utf8'))
    elif request.method == 'OPTIONS':
        # Nothing to do: just return the Access-Control headers below.
        pass
    else:
        return 'use POST method to save files'
    # NOTE(review): '*' lets any origin post to this server; the original
    # author left 'https://penzu.com' as a commented-out alternative.
    response.headers['Access-Control-Allow-Origin'] = '*'
    response.headers['Access-Control-Allow-Methods'] = 'POST'
    response.headers['Access-Control-Allow-Headers'] = "Origin, X-Requested-With, Content-Type, Accept"
    return response
def convert_to_html(s):
    """Turn the line-structuring HTML tags of an entry body into newlines.

    ``<br>`` tags and opening/closing ``<p>`` / ``<div>`` tags (with any
    attributes, in any letter case) are each replaced by a newline, then
    character references are decoded by ``update_html_unicode``. All other
    tags are left untouched.

    Args:
        s: the HTML text of the entry body.

    Returns:
        The text with those tags replaced and entities decoded.
    """
    s = re.sub(r'<br\s*/?>', '\n', s, flags=re.IGNORECASE)
    # The lookahead requires the tag name to end right after "p"/"div", so
    # that e.g. <pre> or <param> is not mistaken for <p ...>.
    s = re.sub(r'</?(?:p|div)(?=[\s/>])[^>]*>', '\n', s, flags=re.IGNORECASE)
    return update_html_unicode(s)
import HTMLParser | |
def update_html_unicode(s):
    """Replace HTML character references in ``s`` with the characters they name.

    Handles decimal (``&#233;``), hexadecimal (``&#xE9;``) and named
    (``&eacute;``) references. Decoding is done by hand, in a single pass over
    the text, so the output of one replacement is never decoded a second time
    (``&amp;lt;`` becomes ``&lt;``, not ``<``). Anything that is not a
    recognisable reference -- unknown names, code points out of range, or a
    stray ``&`` followed later by ``;`` -- is left exactly as it was.

    Args:
        s: text that may contain HTML character references.

    Returns:
        The text with every recognised reference decoded.
    """
    # Local imports keep this function usable on both Python 2 and Python 3
    # without touching the module's import block.
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2
    try:
        to_char = unichr  # Python 2: chr() is limited to 0..255 there
    except NameError:
        to_char = chr  # Python 3

    def _decode(match):
        # Decode one reference; fall back to the original text on any failure.
        ref = match.group(1)
        try:
            if ref[0] != '#':
                code = name2codepoint[ref]
            elif ref[1] in 'xX':
                code = int(ref[2:], 16)
            else:
                code = int(ref[1:], 10)
            return to_char(code)
        except (KeyError, ValueError, OverflowError):
            return match.group(0)

    return re.sub(
        r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);',
        _decode,
        s,
    )
if __name__ == "__main__":
    # Script entry point: start the Flask server with debug mode switched on.
    # NOTE(review): debug mode is intended for local development only --
    # confirm this server is never exposed beyond the local machine.
    run_options = {'debug': True}
    app.run(**run_options)
BeautifulSoup==3.2.1 | |
Flask==0.10.1 | |
Jinja2==2.7.2 | |
MarkupSafe==0.18 | |
Werkzeug==0.9.4 | |
argparse==1.2.1 | |
itsdangerous==0.23 | |
wsgiref==0.1.2 |
(function(){ | |
// Export driver: collects the URL of every journal entry page by page, then
// downloads and saves the entries one at a time.
var PENZU = {
    // Path prefix that, followed by an entry id, loads a single entry.
    link_template: '/pad/load_entry/',
    // Entry URLs accumulated by linksForPage().
    links: [],

    // Entry point: read the page numbers from the pagination elements of the
    // current document, fetch the entry list of each page sequentially, then
    // hand all collected links to processLinks().
    export: function(){
        var self = this;
        var pages = $.makeArray(
            $("a.page, span.current.lmarg").map(function(n, p){
                // Explicit radix: without it some engines parse "08" as octal.
                return parseInt($(p).text(), 10);
            })
        );
        // Pagination elements without a numeric label yield NaN; drop them
        // rather than requesting page "NaN".
        pages = $.grep(pages, function(p){ return !isNaN(p); });

        _fetchLinks();

        // Continuation passed to linksForPage(): a failed page is reported and
        // skipped so the remaining pages are still exported.
        function _fetchLinks(err){
            if(err){
                console.warn('skipping page: ' + err);
            }
            if(pages.length){
                self.linksForPage(pages.pop(), _fetchLinks);
            }else{
                self.processLinks(self.links);
            }
        }
    },

    // Request the entry list of one page and append a link per entry to
    // this.links. Calls cb(null) on success and cb(message) on failure.
    linksForPage: function(pageNumber, cb){
        console.log('loading links for page %d', pageNumber);
        var self = this;
        $.ajax({
            url: '/entries/entries',
            data: {'page': pageNumber},
            type: 'GET',
            dataType: 'json',
            success: function(res){
                // Tolerate a response without an "entries" array.
                var entries = (res && res.entries) || [];
                var links = $.map(entries, function(e){
                    return self.link_template + e.id;
                });
                self.links = $.merge(self.links, links);
                cb(null);
            },
            error: function(){
                console.error('linksForPage error');
                cb('error in linksForPage ' + pageNumber);
            }
        });
    },

    // Download and save the given links one after another (the array is
    // consumed from its end). An entry that cannot be loaded is skipped; a
    // failed save stops the run, as the local server is then most likely
    // unreachable.
    processLinks: function(links){
        console.log("found %d links", links.length);
        _next();

        function _next(){
            // Also covers the case of an empty list on the very first call.
            if(!links.length){
                console.log('no more links to process');
                return;
            }
            var link = links.pop();
            console.log('processing link %s, number of links left: %d', link, links.length);
            extractArticle(link, function(er, data){
                if(er){
                    console.error(er);
                    _next();
                    return;
                }
                saveToFile(data, function(err){
                    if(err){
                        console.error(err);
                    }else{
                        _next();
                    }
                });
            });
        }
    }
};

PENZU.export();
// function getLinks(page){ | |
// var links = page.find(".etitles a").map(function(){ | |
// return $(this).attr('href'); | |
// }); | |
// return $.makeArray(links); | |
// } | |
// Load one entry page and pass its fields to cb(err, data).
// data has the keys date (ISO string, or '' when the page's date cannot be
// parsed), title, body, ftype and source.
function extractArticle(link, cb){
    $.ajax({
        url: link,
        type: 'GET',
        success: function(res){
            var el = $(res);
            // .find() only searches descendants, so an element sitting at the
            // top level of the parsed response would be missed; check both.
            function pick(selector){
                return el.filter(selector).add(el.find(selector));
            }
            var d = new Date(pick('#entry_date_url').text());
            var data = {
                // toISOString() throws a RangeError on an invalid date.
                date: isNaN(d.getTime()) ? '' : d.toISOString(),
                title: pick('#entry_title').val() || '',
                body: pick('#entry_body').text(),
                ftype: 'html',
                source: 'penzu'
            };
            cb(null, data);
        },
        // Without an error handler a failed request would never call cb and
        // the caller's chain would stall silently.
        error: function(){
            cb('cannot load entry ' + link);
        }
    });
}
// POST one extracted entry to the local save endpoint.
// Calls cb(null) once the server has answered successfully, or
// cb('cannot save data') when the request fails.
function saveToFile(data, cb){
    var onSaved = function(){
        console.log('saved');
        cb(null);
    };
    var onFailed = function(er, x){
        console.warn(x);
        cb('cannot save data');
    };
    var request = {
        url: 'http://localhost:5000/save',
        type: 'POST',
        crossDomain: true,
        data: data,
        success: onSaved,
        error: onFailed
    };
    $.ajax(request);
}
})(); |
You can't extract the title, maybe because it is a textarea. So instead I had to extract the previous title from the 'previous' link at the bottom. Then I shifted the column in post-processing.
To anyone who comes here to export data from Penzu:
you can easily use the Data Miner Chrome extension for that.
https://chrome.google.com/webstore/detail/data-scraper-easy-web-scr/nndknepjnldbdbepjfgmncbggmopgden