Last active
March 3, 2024 15:18
-
-
Save mirontoli/a3dd9d9618477f1ddc5311c509bb8bab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://gist.github.com/mirontoli/a3dd9d9618477f1ddc5311c509bb8bab | |
/* | |
set up a project | |
npm init | |
npm install axios | |
npm install puppeteer   (the script below does require('puppeteer'); puppeteer-core alone does not provide that module name)
The node version I had in this project is v18.17.1 | |
download the file: | |
curl -O https://gist.githubusercontent.com/mirontoli/a3dd9d9618477f1ddc5311c509bb8bab/raw/penzu-export.js | |
start Chrome with debugging on: | |
Start-Process Chrome --remote-debugging-port=9222 | |
update the params below: | |
journalId and the most recent post are in the url: | |
https://penzu.com/journals/{journalId}/{mostRecentPostId} | |
node penzu-export.js | |
*/ | |
const puppeteer = require('puppeteer'); | |
const fs = require('node:fs'); | |
// Ids of the posts that have already been written; checked before navigating
// to a "previous" post so the crawl never loops back.
const processed_ids = [];
// Every exported post, in the order the responses arrived.
const posts = [];
// Number of entry API responses seen so far (only used in log output).
let counter = 0;
// True until the first post has been written: that row opens the JSON array
// with "[", every later row is prefixed with ",". Starting with false would
// produce a file that begins with "," and is not valid JSON.
let firstRow = true;
const journalId = '9236611';
const mostRecentPostId = '90472264';
const fileName = 'posts.json';
// Do not go below 10 seconds: shorter delays cause HTTP 429 errors.
// A random 0-5 s jitter is added on top of this value before each navigation.
const minimumDelayMs = 10000;
// Responses of recurring "noise" requests, keyed by URL, replayed from memory
// instead of hitting the network again.
const cache = {};
// Log in to Penzu in Chrome.
// Copy the page url of the most recent post.
// The script then walks back through the previous posts automatically,
// so it has to start from the most recent article/post.
const pageUrlMostRecentPost = `https://penzu.com/journals/${journalId}/${mostRecentPostId}`;
/**
 * Appends text to the export file (`fileName`).
 *
 * The write is synchronous on purpose: the last post row and the closing
 * "]" are written back-to-back, and independent asynchronous appends are not
 * guaranteed to reach the file in the order they were issued.
 *
 * @param {string} text - Text to append to the file.
 * @returns {void} Write errors are logged, not thrown.
 */
function writeToFile(text) {
  try {
    fs.appendFileSync(fileName, text);
  } catch (err) {
    console.error(err);
  }
}
/**
 * Exports the journal by walking backwards from the most recent post.
 *
 * Connects to the Chrome instance at `wsChromeEndpointurl`, opens a new tab
 * and loads `pageUrlMostRecentPost`. Every entry API response seen by the tab
 * is appended to the export file as one JSON row; the tab is then navigated
 * to the first not-yet-exported post listed in the response's `previous`
 * array. When there is no such post the JSON array is closed.
 *
 * @returns {Promise<void>} Resolves once the first page load has finished;
 *   the rest of the crawl is driven by the response handler.
 */
async function downloadJournalPosts() {
  const entriesUrlPrefix = `https://penzu.com/api/journals/${journalId}/entries/`;

  const browser = await puppeteer.connect({
    browserWSEndpoint: wsChromeEndpointurl,
  });
  const page = await browser.newPage();

  // Resolves after `time` milliseconds.
  function delay(time) {
    return new Promise((resolve) => {
      setTimeout(resolve, time);
    });
  }

  // Waits minimumDelayMs plus a random 0-5 s, then loads the given post page.
  async function gotoPrevious(mostPreviousId) {
    const mostPreviousUrl = `https://penzu.com/journals/${journalId}/${mostPreviousId}`;
    const ms = minimumDelayMs + Math.floor(Math.random() * 5000);
    await delay(ms);
    await page.goto(mostPreviousUrl, {});
  }

  // True for the recurring requests whose responses are replayed from `cache`.
  function isCacheableNoise(url) {
    return (
      url === "https://penzu.com/api/settings" ||
      url === `https://penzu.com/api/journals/${journalId}` ||
      url.startsWith(`https://penzu.com/api/journals/${journalId}/page_themes`) ||
      url.startsWith(`https://penzu.com/api/journals/${journalId}/pad_themes`) ||
      url.startsWith("https://syndication.twitter.com/settings") ||
      url.startsWith("https://penzu.com/api/user/one_time_modal") ||
      url.startsWith("https://penzu.com/api/tags") ||
      url === "https://penzu.com/api/journals" ||
      url.endsWith("photos")
    );
  }

  // https://docs.apify.com/academy/node-js/caching-responses-in-puppeteer
  await page.setRequestInterception(true);
  page.on('request', async (request) => {
    try {
      const cached = cache[request.url()];
      if (cached) {
        await request.respond(cached);
        return;
      }
      await request.continue();
    } catch (err) {
      // A rejection here would otherwise be unhandled inside the event handler.
      console.error(err);
    }
  });

  page.on('response', async (response) => {
    const url = response.url();
    const isPost = url.startsWith(entriesUrlPrefix) && !url.endsWith("/photos");

    if (!isPost) {
      // ignore the noise, but remember up to 10 of the recurring responses
      if (Object.keys(cache).length < 10 && isCacheableNoise(url)) {
        let buffer;
        try {
          buffer = await response.buffer();
        } catch (error) {
          // some responses have no body to buffer; they are simply not cached
          return;
        }
        cache[url] = {
          status: response.status(),
          headers: response.headers(),
          body: buffer,
        };
      }
      return;
    }

    counter += 1;
    let body;
    try {
      body = await response.json();
    } catch (err) {
      // The body is not JSON (or there is no body): nothing to export from it.
      console.error(`could not read the response of ${url} (status ${response.status()})`, err);
      return;
    }

    const entry = body?.entry;
    const history = body?.previous;
    if (entry) {
      console.log(`${counter} id: ${entry.id}`);
      const p = {
        id: entry.id,
        created_at: entry.created_at,
        title: entry.title,
        plaintext: entry.plaintext_body,
        tags: (entry.tags ?? []).map((t) => t.name),
      };
      posts.push(p);
      const post = JSON.stringify(p);
      let row = `,\n${post}`;
      // the first row opens the JSON array instead of continuing it
      if (firstRow) {
        row = `[\n${post}`;
        firstRow = false;
      }
      writeToFile(row);
      processed_ids.push(p.id);
    } else {
      console.error("no entry!");
    }

    if (!history || history.length === 0) {
      console.error("no history anymore");
      writeToFile("\n]\n");
      return;
    }

    // Pick the first previous post that has not been exported yet, to avoid
    // looping back to a post that was already visited.
    let mostPrevious = null;
    for (const item of history) {
      const candidate = item?.entry;
      if (candidate && !processed_ids.includes(candidate.id)) {
        mostPrevious = candidate;
        break;
      }
      console.log(`oops, the first previous is not unique, the page is on ${entry?.id}`);
    }

    if (!mostPrevious) {
      // Nothing left to visit: stop here and close the array so the file stays valid JSON.
      console.error("There is not unique mostPrevious");
      writeToFile("\n]\n");
      return;
    }

    console.log(`mostPrevious id: ${mostPrevious.id}`);
    // Not awaited so the handler returns right away; failures are still reported.
    gotoPrevious(mostPrevious.id).catch((err) => {
      console.error(err);
    });
  });

  await page.goto(pageUrlMostRecentPost, {});
  console.log("hej4");
}
// you can skip this section if you prefer getting ws endpoint manually | |
// if so navigate to http://127.0.0.1:9222/json/version and copy the ws endpoint: | |
// for more info see | |
//https://medium.com/@jaredpotter1/connecting-puppeteer-to-existing-chrome-window-8a10828149e0 | |
const axios = require('axios');
// WebSocket debugger endpoint of the running Chrome; set below before the export starts.
let wsChromeEndpointurl = '';
axios.get('http://127.0.0.1:9222/json/version')
  .then((res) => {
    wsChromeEndpointurl = res.data.webSocketDebuggerUrl;
    console.log(`wsChromeEndpointurl ${wsChromeEndpointurl}`);
    if (!wsChromeEndpointurl) {
      throw new Error('http://127.0.0.1:9222/json/version did not return a webSocketDebuggerUrl');
    }
    // Returned so a failure inside the export reaches the catch below.
    return downloadJournalPosts();
  })
  .catch((err) => {
    // Typical cause: Chrome was not started with --remote-debugging-port=9222.
    console.error('penzu export failed', err);
    process.exitCode = 1;
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment