Skip to content

Instantly share code, notes, and snippets.

@romanlv
Last active July 6, 2023 18:30
Show Gist options
  • Save romanlv/8612140 to your computer and use it in GitHub Desktop.
Save romanlv/8612140 to your computer and use it in GitHub Desktop.
export data from online journals
  1. start the server locally
  2. log in to your Penzu account
  3. execute script in chrome->dev tools->sources->snippets

It fetches all the data using your current session and sends it to the local server, which saves everything in the export folder.

from flask import Flask, make_response, request
import os
import re,datetime
BASE_FOLDER = os.path.join(os.path.dirname(__file__), 'export')
app = Flask(__name__)
def isostr_to_date(s):
    """Parse an ISO-ish date/time string into a datetime.

    Splits the string on runs of non-digit characters and passes the
    numeric fields to the datetime constructor, so both
    '2023-07-06T18:30:00Z' and '2014-01-25' parse correctly.

    The previous implementation dropped the *last* split field
    unconditionally, which only worked when the string ended in a
    non-digit (such as the trailing 'Z' that Date.toISOString emits);
    a plain date like '2014-01-25' lost its day component. Filtering
    out empty fields handles both cases.
    """
    fields = [int(part) for part in re.split(r'\D+', s) if part]
    return datetime.datetime(*fields)
@app.route("/save", methods=['GET', 'POST', 'OPTIONS'])
def save():
response = make_response('success')
if request.method == 'POST':
data = request.form
ftype = data.get('ftype') or 'html'
title = data.get('title')
sdate = data.get('date')
if sdate:
d = isostr_to_date(sdate)
sdate = d.strftime('%Y-%m-%d')
title = '%s-%s' % (sdate, title)
body = data.get('body')
if ftype == 'html':
body = convert_to_html(body)
ftype = 'txt'
title = re.sub(r'[\s,-\.\/\?\']+', '_', title)
name = '%s.%s' % ( title, ftype)
path = os.path.join(BASE_FOLDER, name)
with open(path, 'wb') as f:
f.write(data.get('title').encode('utf8') + '\n\n')
f.write(body.encode('utf8'))
elif request.method == 'OPTIONS':
#do nothing just return Access Control headers
pass
else:
return 'use POST method to save files'
response.headers['Access-Control-Allow-Origin'] = '*' #https://penzu.com'
response.headers['Access-Control-Allow-Methods'] = 'POST'
response.headers['Access-Control-Allow-Headers'] = "Origin, X-Requested-With, Content-Type, Accept"
return response
def convert_to_html(s):
    """Convert an HTML fragment to plain text.

    (Name kept for caller compatibility, although it strips markup rather
    than producing it.) Replaces <br>, <p> and <div> tags with newlines,
    then decodes HTML entities via update_html_unicode().
    """
    s = re.sub(r'<br\s*\/?>', '\n', s)
    for tag in ('p', 'div'):
        # \b stops the pattern at the end of the tag name; the previous
        # '<\/?%s[^>]*>' form let the wildcard start right after 'p' and
        # so wrongly stripped unrelated tags such as <pre> and <param>.
        s = re.sub(r'<\/?%s\b[^>]*>' % tag, '\n', s)
    return update_html_unicode(s)
import HTMLParser
def update_html_unicode(s):
    """Replace HTML character entities in ``s`` with the characters they encode.

    Numeric references are decoded directly with unichr(): decimal
    (&#8217;) and hexadecimal (&#x2019;) forms are both handled — the
    previous version raised NotImplementedError on hex references, which
    aborted the whole request. Anything that is not a numeric reference
    (named entities such as &amp;) falls back to HTMLParser.unescape().

    Original author's note: parser.unescape alone did not decode some
    characters on their machine, hence the manual numeric path.
    """
    parser = HTMLParser.HTMLParser()
    for entity in re.findall("&.+?;", s):
        # For numeric references, the text between '&#' and ';'.
        code = entity[2:-1]
        try:
            if code[:1] in ('x', 'X'):
                # Hexadecimal reference, e.g. &#x27;
                unescaped = unichr(int(code[1:], 16))
            else:
                unescaped = unichr(int(code, 10))
        except ValueError:
            # Not numeric (e.g. &amp;, &nbsp;): let the parser decode it.
            unescaped = parser.unescape(entity)
        s = s.replace(entity, unescaped)
    return s
if __name__ == "__main__":
app.run(debug=True)
BeautifulSoup==3.2.1
Flask==0.10.1
Jinja2==2.7.2
MarkupSafe==0.18
Werkzeug==0.9.4
argparse==1.2.1
itsdangerous==0.23
wsgiref==0.1.2
(function(){
var PENZU = {
    // Each entry page is fetched from this path + the entry id.
    link_template: '/pad/load_entry/',
    // Accumulates entry URLs across all pagination pages.
    links: [],

    // Entry point: read the page numbers from the pager, fetch the entry
    // list of every page (one at a time), then process all collected links.
    export: function(){
        //this.exportPage($('body'));
        // Page numbers shown in the pager, including the current page.
        var pages = $("a.page, span.current.lmarg").map(function(n,p) { return parseInt($(p).text()) });
        pages = $.makeArray(pages);
        var self = this;
        _fetchLinks();
        // Sequential async loop via callback recursion: pop one page per
        // round-trip until `pages` is drained; `err` is received from
        // linksForPage but not acted upon here.
        function _fetchLinks(err){
            if(pages.length){
                self.linksForPage(pages.pop(), _fetchLinks);
            }else{
                self.processLinks(self.links);
            }
        }
    },

    // Fetch the JSON entry list for one pagination page and append the
    // per-entry URLs to this.links; calls cb(null) on success or
    // cb(message) on failure.
    linksForPage: function(pageNumber, cb){
        console.log('loading links for page %d', pageNumber);
        var self = this;
        $.ajax({
            url: '/entries/entries',
            data: {'page': pageNumber},
            type: 'GET',
            dataType: 'json',
            success: function(res){
                var page = $(res);
                var links = res.entries.map(function(e){
                    return self.link_template + e.id;
                });
                self.links = $.merge(self.links, links);
                cb(null)
                //self.exportPage(page);
            },
            error: function(){
                console.error('linksForPage error');
                cb('error in linksForPage ' + pageNumber)
            }
        });
    },

    // Download and save every collected entry, one at a time (again via
    // callback recursion); stops on the first save error.
    // NOTE(review): assumes `links` is non-empty — links.pop() on an empty
    // array yields undefined and the first request would fail. Confirm
    // upstream always finds at least one entry.
    processLinks: function(links){
        //var links = getLinks(page);
        console.log("found %d links", links.length)
        _run(links.pop());
        function _run(link){
            console.log('processing link %s, number of links left: %d', link, links.length)
            extractArticle(link, function(er, data){
                saveToFile(data, function(err){
                    if(err){
                        console.error(err);
                    }
                    else if(links.length){
                        _run(links.pop());
                    }
                });
            });
        }
    }
}
// Kick off the export as soon as the snippet runs in the dev-tools console.
PENZU.export()
// function getLinks(page){
// var links = page.find(".etitles a").map(function(){
// return $(this).attr('href');
// });
// return $.makeArray(links);
// }
// Load one entry page and pull out the fields the local save server
// expects; invokes cb(null, data) once the page has been parsed.
function extractArticle(link, cb){
    $.get(link, function(html){
        var page = $(html);
        var entryDate = new Date(page.find('#entry_date_url').text());
        cb(null, {
            date: entryDate.toISOString(),
            title: page.find('#entry_title').val(),
            body: page.find('#entry_body').text(),
            ftype: 'html',
            source: 'penzu'
        });
    });
}
// POST one extracted entry to the local Flask server, which writes it to
// disk; invokes cb(null) on success or cb(message) on failure.
function saveToFile(data, cb){
    var settings = {
        crossDomain: true,
        url: 'http://localhost:5000/save',
        type: 'POST',
        data: data,
        //dataType: 'json',
        success: function(){
            console.log('saved');
            cb(null);
        },
        error: function(er, x){
            console.warn(x);
            cb('cannot save data')
        }
    };
    $.ajax(settings);
}
})();
@janakact
Copy link

You can't extract the title, maybe because it is a textarea. So instead I had to extract the previous entry's title from the 'previous' link at the bottom, then shift the column in post-processing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment