Skip to content

Instantly share code, notes, and snippets.

@mirontoli
Last active March 3, 2024 15:18
Show Gist options
  • Save mirontoli/a3dd9d9618477f1ddc5311c509bb8bab to your computer and use it in GitHub Desktop.
Save mirontoli/a3dd9d9618477f1ddc5311c509bb8bab to your computer and use it in GitHub Desktop.
// https://gist.github.com/mirontoli/a3dd9d9618477f1ddc5311c509bb8bab
/*
set up a project
npm init
npm install axios
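npm install puppeteer (the script does require('puppeteer'); puppeteer-core alone is not enough unless you change that require accordingly)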
The node version I had in this project is v18.17.1
download the file:
curl -O https://gist.githubusercontent.com/mirontoli/a3dd9d9618477f1ddc5311c509bb8bab/raw/penzu-export.js
start Chrome with debugging on:
Start-Process Chrome --remote-debugging-port=9222
update the params below:
journalId and the most recent post are in the url:
https://penzu.com/journals/{journalId}/{mostRecentPostId}
node penzu-export.js
*/
const puppeteer = require('puppeteer');
const fs = require('node:fs');
// Ids of the entries that were already exported; consulted when picking the
// next entry so the walk does not loop back to an entry it has already seen.
let processed_ids = [];
// Every exported entry, kept in memory in addition to being written to the file.
let posts = [];
// Number of entry API responses seen so far (only used in log output).
let counter = 0;
// True until the first entry has been written: the first row has to open the
// JSON array with "[", every later row is prefixed with "," instead.
let firstRow = true;
// journalId and mostRecentPostId are taken from the page url:
// https://penzu.com/journals/{journalId}/{mostRecentPostId}
const journalId = '9236611';
const mostRecentPostId = '90472264';
// The export is appended to this file (relative to the working directory).
const fileName = 'posts.json';
// Base delay between two page navigations, in milliseconds; gotoPrevious adds
// up to 5 s of random jitter on top of it.
// NOTE(review): the original comment warned that waiting less than 10 s causes
// HTTP 429 errors, yet the value is only 1 s. Raise it (e.g. to 10000) if you
// run into 429 responses.
const minimumDelayMs = 1000;
// Responses of auxiliary API calls, keyed by url. They are replayed by the
// request interception so they are not fetched again on every navigation.
const cache = {};
// Log in to Penzu in Chrome first.
// This must be the url of the MOST RECENT post: the script only walks
// backwards from it through the previous posts.
const pageUrlMostRecentPost = `https://penzu.com/journals/${journalId}/${mostRecentPostId}`;
/**
 * Appends text to the export file (`fileName`).
 *
 * The write is synchronous on purpose: the last entry row and the closing "]"
 * are written back-to-back, and asynchronous appends are not guaranteed to
 * reach the file in call order.
 *
 * @param {string} text - The text to append.
 * @returns {void} Write errors are logged, not thrown.
 */
function writeToFile(text) {
  try {
    fs.appendFileSync(fileName, text);
  } catch (err) {
    console.error(err);
  }
}
/**
 * Exports the journal by walking backwards from the most recent entry.
 *
 * Connects to the already running Chrome instance at `wsChromeEndpointurl`,
 * opens a new tab on `pageUrlMostRecentPost` and listens to the responses the
 * page receives. Every entry API response is appended to the export file via
 * `writeToFile`; the `previous` list of the same response decides which entry
 * is opened next. When an entry response has no `previous` list, the closing
 * "]" is written and the walk stops.
 *
 * `wsChromeEndpointurl` is assigned by the bootstrap code at the bottom of the
 * file. To supply it manually, copy `webSocketDebuggerUrl` from
 * http://127.0.0.1:9222/json/version.
 *
 * @returns {Promise<void>} Resolves once the first page has been loaded; the
 *   rest of the walk is driven by the response listener.
 */
async function downloadJournalPosts() {
  const entriesApiPrefix = `https://penzu.com/api/journals/${journalId}/entries/`;

  const browser = await puppeteer.connect({
    browserWSEndpoint: wsChromeEndpointurl,
  });
  const page = await browser.newPage();

  /** Resolves after `time` milliseconds. */
  function delay(time) {
    return new Promise((resolve) => {
      setTimeout(resolve, time);
    });
  }

  /** Waits minimumDelayMs plus up to 5 s of jitter, then opens the given entry. */
  async function gotoPrevious(mostPreviousId) {
    const mostPreviousUrl = `https://penzu.com/journals/${journalId}/${mostPreviousId}`;
    const ms = minimumDelayMs + Math.floor(Math.random() * 5000);
    await delay(ms);
    await page.goto(mostPreviousUrl);
  }

  /** Tells whether the url is one of the auxiliary calls worth caching. */
  function isCacheable(url) {
    return (
      url === "https://penzu.com/api/settings" ||
      url === `https://penzu.com/api/journals/${journalId}` ||
      url.startsWith(`https://penzu.com/api/journals/${journalId}/page_themes`) ||
      url.startsWith(`https://penzu.com/api/journals/${journalId}/pad_themes`) ||
      url.startsWith("https://syndication.twitter.com/settings") ||
      url.startsWith("https://penzu.com/api/user/one_time_modal") ||
      url.startsWith("https://penzu.com/api/tags") ||
      url === "https://penzu.com/api/journals" ||
      url.endsWith("photos")
    );
  }

  /** Stores a cacheable response so the request listener can replay it later. */
  async function cacheResponse(response, url) {
    // The cache is capped at 10 entries.
    if (Object.keys(cache).length >= 10 || !isCacheable(url)) {
      return;
    }
    let buffer;
    try {
      buffer = await response.buffer();
    } catch (error) {
      // Some responses have no body to read; those are simply not cached.
      return;
    }
    cache[url] = {
      status: response.status(),
      headers: response.headers(),
      body: buffer,
    };
  }

  /** Appends one entry to the export file and remembers its id. */
  function exportEntry(entry) {
    console.log(`${counter} id: ${entry.id}`);
    // entry.richtext_body is not exported, only the plain text.
    const p = {
      id: entry.id,
      created_at: entry.created_at,
      title: entry.title,
      plaintext: entry.plaintext_body,
      tags: (entry.tags ?? []).map((t) => t.name),
    };
    posts.push(p);
    const post = JSON.stringify(p);
    // The first row opens the JSON array, all later rows are comma separated.
    const row = firstRow ? `[\n${post}` : `,\n${post}`;
    firstRow = false;
    writeToFile(row);
    processed_ids.push(p.id);
  }

  /**
   * Returns the first entry of `history` that was not exported yet, or null
   * when every candidate has already been processed.
   */
  function findUnprocessedPrevious(history, currentId) {
    for (const item of history) {
      const candidate = item?.entry;
      if (!candidate) {
        continue;
      }
      if (!processed_ids.includes(candidate.id)) {
        return candidate;
      }
      console.log(`oops, the first previous is not unique, the page is on ${currentId}`);
    }
    return null;
  }

  /** Exports the entry of an entry API response and moves on to the previous one. */
  async function handleEntryResponse(response) {
    counter += 1;
    let body;
    try {
      body = await response.json();
    } catch (err) {
      // E.g. an error page instead of JSON; without a body the walk cannot go on.
      console.error(`could not read entry response (HTTP ${response.status()}): ${response.url()}`, err);
      return;
    }
    const entry = body?.entry;
    const history = body?.previous;
    if (entry) {
      exportEntry(entry);
    } else {
      console.error("no entry!");
    }
    if (!history || history.length === 0) {
      console.error("no history anymore");
      writeToFile("\n]\n");
      return;
    }
    const mostPrevious = findUnprocessedPrevious(history, entry?.id);
    if (!mostPrevious) {
      console.error("There is not unique mostPrevious");
      return;
    }
    console.log(`mostPrevious id: ${mostPrevious.id}`);
    // Not awaited: the navigation is delayed and must not block this listener.
    gotoPrevious(mostPrevious.id).catch((err) => {
      console.error(`could not open entry ${mostPrevious.id}`, err);
    });
  }

  // https://docs.apify.com/academy/node-js/caching-responses-in-puppeteer
  await page.setRequestInterception(true);
  page.on('request', async (request) => {
    const cached = cache[request.url()];
    try {
      if (cached) {
        await request.respond(cached);
      } else {
        await request.continue();
      }
    } catch (err) {
      console.error(err);
    }
  });
  page.on('response', async (response) => {
    const url = response.url();
    const isPost = url.startsWith(entriesApiPrefix) && !url.endsWith("/photos");
    try {
      if (isPost) {
        await handleEntryResponse(response);
      } else {
        // Everything else is noise; a few of those calls are cached.
        await cacheResponse(response, url);
      }
    } catch (err) {
      console.error(err);
    }
  });

  await page.goto(pageUrlMostRecentPost);
  console.log("hej4");
}
// you can skip this section if you prefer getting ws endpoint manually
// if so navigate to http://127.0.0.1:9222/json/version and copy the ws endpoint:
// for more info see
//https://medium.com/@jaredpotter1/connecting-puppeteer-to-existing-chrome-window-8a10828149e0
const axios = require('axios');
// WebSocket endpoint of the running Chrome instance. It is filled in below and
// read by downloadJournalPosts when it connects to the browser.
let wsChromeEndpointurl = '';
axios
  .get('http://127.0.0.1:9222/json/version')
  .then((res) => {
    wsChromeEndpointurl = res.data.webSocketDebuggerUrl;
    if (!wsChromeEndpointurl) {
      throw new Error('webSocketDebuggerUrl is missing in the response of /json/version');
    }
    console.log(`wsChromeEndpointurl ${wsChromeEndpointurl}`);
    // Returned so that a failing export ends up in the catch below.
    return downloadJournalPosts();
  })
  .catch((err) => {
    console.error('Export failed. Is Chrome running with --remote-debugging-port=9222?', err);
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment