Last active
September 13, 2023 02:05
-
-
Save Richardn2002/1bce5d24812f31389f5584d15823e46f to your computer and use it in GitHub Desktop.
Zhihu chat history logger.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
// Account-specific values (names, cookie, user agent, ids) are read from secret.json
// so they stay out of the source.
const secret = JSON.parse(fs.readFileSync('secret.json', 'utf8'));

// Output directories for downloaded media.
const IMAGE_PATH = './out/images';
const STICKER_PATH = './out/stickers';
// Paths of the readable and raw history files; assigned later, once the message id
// that names them is known.
let historyPath;
let rawHistoryPath;
// File holding the logging state between runs.
const STATUS_PATH = './status.json';

// `recursive: true` creates missing parent directories and does nothing for a directory
// that already exists, so neither directory depends on the other having been created
// first.
[IMAGE_PATH, STICKER_PATH].forEach((directory) => {
  fs.mkdirSync(directory, {recursive: true});
});

// secret.X_ZST_81 is intentionally not read; no request in this file uses it.
const {SENDER_NAME, RECEIVER_NAME, DC_0, USER_AGENT, COOKIE, SENDER_ID} = secret;
// Value sent as the `x-zse-93` header and used as the first input of the x-zse-96 helper.
const X_ZSE_93 = '101_3_2.0';
const x_zse_96_2_0 = require('./x-zse-96-2.0-encrypt'); | |
const url = require('url'); | |
/**
 * Pointer to one page of chat history, identified by the `after_id` query parameter
 * of a paging link.
 */
class ChatEntry {
  /**
   * @param {string} href - Paging link carrying an `after_id` query parameter. It may be
   *   absolute or relative to https://www.zhihu.com.
   */
  constructor(href) {
    // The WHATWG URL API replaces the deprecated url.parse(); the base is only
    // consulted when href is relative.
    const afterId = new URL(href, 'https://www.zhihu.com').searchParams.get('after_id');
    // A missing parameter stays `undefined` (not null), as it was with url.parse().
    this.after_id = afterId === null ? undefined : afterId;
  }

  /**
   * Builds the options object for https.request() that fetches this page.
   *
   * @returns {object} Request options (hostname, port, path, method, headers).
   */
  getRequestOptions() {
    const path = `/api/v4/chat?sender_id=${SENDER_ID}&after_id=${this.after_id}&limit=20`;
    return {
      hostname: 'www.zhihu.com',
      port: 443,
      path: path,
      method: 'GET',
      headers: {
        'User-Agent': USER_AGENT,
        'x-zse-93': X_ZSE_93,
        // Header value derived by the x-zse-96 helper from the version tag, the exact
        // request path and DC_0, joined with '+'.
        'x-zse-96': x_zse_96_2_0([X_ZSE_93, path, DC_0].join('+')),
        'Cookie': COOKIE
      }
    };
  }
}
// Number of messages of the current page whose post-processing (media download plus
// status write) is still pending; continueGuard() only acts once it is back to 0.
let taskCounter = 0;

/**
 * One chat message, normalised from the API payload for logging.
 */
class Message {
  /**
   * @param {object} message - Raw message object taken from the chat API response.
   */
  constructor(message) {
    this.id = message.id;
    this.timestamp = message.created_time;
    this.type = message.content_type;
    // The timestamp is multiplied by 1000 and shifted by eight hours before formatting,
    // so the ISO string shows UTC+8 wall-clock time.
    this.dateString = new Date(this.timestamp * 1000 + 8 * 3600 * 1000).toISOString();
    this.speaker = message.user_type === 'receiver' ? RECEIVER_NAME : SENDER_NAME;
    if (message.is_canceled) {
      // Recalled messages keep no content, whatever their type.
      this.text = '[Message recalled.]';
      return;
    }
    switch (this.type) {
      case 0:
        this.text = message.text;
        break;
      case 1:
        this.contentUrl = message.image.url;
        // Name the file after the message id, keeping the URL's extension.
        this.contentNaming = this.id + Message.extensionOf(this.contentUrl);
        this.text = '[Image: ' + this.contentNaming + ']';
        break;
      case 2:
        this.contentUrl = message.sticker.url;
        // Some stickers have no title; fall back to their id.
        this.contentNaming = (message.sticker.title ? message.sticker.title : message.sticker.sticker_id)
          + Message.extensionOf(this.contentUrl);
        this.text = '[Sticker: ' + this.contentNaming + ']';
        break;
      default:
        // Keep the log readable instead of printing "undefined" for unknown types.
        this.text = '[Unsupported message type: ' + this.type + ']';
    }
  }

  /**
   * Extracts the file extension (including the dot) from a content URL.
   *
   * @param {string} contentUrl - URL of the image or sticker.
   * @returns {string} The extension, or '' when the URL does not end in one.
   */
  static extensionOf(contentUrl) {
    // Ignore any query string or fragment so "a.jpg?source=x" still yields ".jpg".
    const bareUrl = String(contentUrl).split(/[?#]/)[0];
    const found = bareUrl.match(/\.[0-9a-z]+$/i);
    return found ? found[0] : '';
  }

  /**
   * Downloads the attached image or sticker (if any, and if not already on disk), then
   * records progress in the status file, decrements taskCounter and calls
   * continueGuard(). Failures are logged and still count as finished so the run goes on.
   */
  downloadContent() {
    const messageId = this.id;
    let finished = false;
    const finish = () => {
      // Several stream events may report the end of the same download; count it once.
      if (finished) {
        return;
      }
      finished = true;
      // min is for in case the more recent file completes download later than older ones
      status.progress = Math.min(status.progress, messageId);
      fs.writeFile(STATUS_PATH, JSON.stringify(status), (err) => {
        if (err) {
          console.log('\nError writing ' + STATUS_PATH + ': ' + err);
        }
        taskCounter--;
        continueGuard();
      });
    };

    // Text, recalled and unsupported messages carry nothing to download.
    if (!this.contentUrl) {
      finish();
      return;
    }
    const filePath = (this.type === 1 ? IMAGE_PATH : STICKER_PATH) + '/' + this.contentNaming;
    if (fs.existsSync(filePath)) {
      finish();
      return;
    }

    const contentUrl = this.contentUrl;
    const reportFailure = (e) => {
      console.log('\nError downloading ' + filePath + ': ' + e);
      console.log('Url: ' + contentUrl + '\n');
      finish();
    };
    https.request(contentUrl, (response) => {
      const chunks = [];
      response.on('data', (chunk) => {
        chunks.push(chunk);
      });
      response.on('end', () => {
        // Buffer.concat also handles an empty body (yields an empty file).
        fs.writeFile(filePath, Buffer.concat(chunks), (err) => {
          if (err) {
            reportFailure(err);
          } else {
            finish();
          }
        });
      });
      response.on('error', reportFailure);
    }).on('error', reportFailure).end(); // connection-level errors are emitted on the request
  }

  /**
   * @returns {string} Human-readable log entry: date, time, speaker, then the text.
   */
  toString() {
    const day = this.dateString.slice(0, 10);
    const time = this.dateString.slice(-13, -5);
    return day + ' ' + time + ' ' + this.speaker + ':\n' + this.text + '\n';
  }

  /**
   * @returns {object} Plain object with the id, timestamp, type, text and content fields.
   */
  toObject() {
    return {
      id: this.id,
      timestamp: this.timestamp,
      type: this.type,
      text: this.text,
      contentNaming: this.contentNaming,
      contentUrl: this.contentUrl
    };
  }
}
// Restore the logging state of the previous run, or start from a blank state when no
// status file exists yet.
let status = fs.existsSync(STATUS_PATH)
  ? JSON.parse(fs.readFileSync(STATUS_PATH))
  : {latest: 0, progress: Infinity, isEnd: false};

// Without an after_id the request serves as a new entry point; that is what is needed
// when nothing was logged yet or when the previous run finished.
let path = '/api/v4/chat?sender_id=' + SENDER_ID;
if (status.latest != 0 && !status.isEnd) {
  // An unfinished run exists: keep appending to its history files, which are named
  // after the message id that run started from, and continue from the recorded progress.
  historyPath = './out/' + status.latest + '.txt';
  rawHistoryPath = './out/' + status.latest + '.rawlist';
  path += '&after_id=' + status.progress + '&limit=20';
}
const https = require('https'); | |
const Stream = require('stream').Transform; | |
// Initiate history pull

// Id of the newest message covered by the previous finished run; continueGuard() stops
// paging once progress reaches it. It is stored on `status` (and therefore saved with
// every status write) so that an interrupted run resumes with the same stop point.
// Status files written before this field existed fall back to 0, i.e. page to the end.
let previousLatest = status.previousLatest === undefined ? 0 : status.previousLatest;
// Most recently parsed API response; continueGuard() reads its `paging` member.
let currentChunk;
https.request({
  hostname: 'www.zhihu.com',
  port: 443,
  path: path,
  method: 'GET',
  headers: {
    'User-Agent': USER_AGENT,
    'x-zse-93': X_ZSE_93,
    'x-zse-96': x_zse_96_2_0([X_ZSE_93, path, DC_0].join('+')),
    'Cookie': COOKIE,
  }
}, (res) => {
  // Decode as UTF-8 at the stream level so a multi-byte character split across two
  // chunks is not corrupted by per-chunk string conversion.
  res.setEncoding('utf8');
  let data = '';
  res.on('data', (d) => {
    data += d;
  });
  res.on('error', (e) => {
    console.log('Error reading chat history response: ' + e);
  });
  res.on('end', () => {
    let parsed;
    try {
      parsed = JSON.parse(data);
    } catch (e) {
      console.log('Unexpected chat history response (HTTP ' + res.statusCode + '): ' + e);
      return;
    }
    const messages = parsed.data && parsed.data.messages;
    if (!Array.isArray(messages) || messages.length === 0) {
      console.log('Chat history response (HTTP ' + res.statusCode + ') contains no messages.');
      return;
    }
    currentChunk = parsed;
    if (status.latest === 0 || status.isEnd) {
      if (messages[0].id === status.latest) {
        // no new messages to log
        return;
      }
      previousLatest = status.latest;
      status.previousLatest = previousLatest;
      status.latest = messages[0].id;
      // Start the new run from a clean slate: progress is only ever lowered with
      // Math.min, so a stale value from the previous run would never move and the
      // run would be declared finished after its first page.
      status.progress = Infinity;
      status.isEnd = false;
      historyPath = './out/' + status.latest + '.txt';
      rawHistoryPath = './out/' + status.latest + '.rawlist';
      processMessageArray(messages);
    } else {
      // exclude the duplicate head (specifically, the message with id after_id)
      processMessageArray(messages.slice(1));
    }
  });
}).on('error', (e) => {
  // Connection-level failures are emitted on the request object.
  console.log('Error requesting chat history: ' + e);
}).end();
/**
 * Fetches one page of chat history and hands its messages to processMessageArray().
 * Request failures and unparsable responses are logged and end the paging; the parsed
 * response is stored in `currentChunk` only once it has been validated.
 *
 * @param {ChatEntry} chatEntry - Page to fetch; supplies the request options.
 */
function processPage(chatEntry) {
  const request = https.request(chatEntry.getRequestOptions(), (res) => {
    // Decode as UTF-8 at the stream level so a multi-byte character split across two
    // chunks is not corrupted by per-chunk string conversion.
    res.setEncoding('utf8');
    let body = '';
    res.on('data', (piece) => {
      body += piece;
    });
    res.on('error', (e) => {
      console.log('Error reading chat history page: ' + e);
    });
    res.on('end', () => {
      let parsed;
      try {
        parsed = JSON.parse(body);
      } catch (e) {
        console.log('Unexpected chat history page (HTTP ' + res.statusCode + '): ' + e);
        return;
      }
      if (!parsed.data || !Array.isArray(parsed.data.messages)) {
        console.log('Chat history page (HTTP ' + res.statusCode + ') has no message list.');
        return;
      }
      currentChunk = parsed;
      processMessageArray(currentChunk.data.messages);
    });
  });
  // Connection-level failures are emitted on the request object.
  request.on('error', (e) => {
    console.log('Error requesting chat history page: ' + e);
  });
  request.end();
}
/**
 * Logs every message of a page: starts its media download, appends the readable entry
 * to the history file and the JSON form to the raw history file, and prints the
 * message date as a progress indicator.
 *
 * @param {object[]} array - Raw message objects from the API response.
 */
function processMessageArray(array) {
  if (array.length === 0) {
    // No message means no download task will ever call continueGuard(); call it here
    // so paging continues or the end of the history is recorded instead of stalling.
    continueGuard();
    return;
  }
  // Register all tasks up front so the counter cannot reach 0 before the page is done.
  taskCounter += array.length;
  for (const message of array) {
    const messageObject = new Message(message);
    messageObject.downloadContent();
    fs.appendFileSync(historyPath, messageObject.toString() + '\n');
    fs.appendFileSync(rawHistoryPath, JSON.stringify(messageObject) + ',');
    // '\r' keeps the progress indicator on a single console line.
    process.stdout.write(messageObject.dateString + '\r');
  }
}
/**
 * Decides what happens once a page has been fully processed. Called after every
 * finished message task; it does nothing while tasks are still pending. Afterwards it
 * either requests the next page, or marks the run as ended when the history end or the
 * previous entry point has been reached.
 */
function continueGuard() {
  // Tasks of the current page are still running.
  if (taskCounter !== 0) {
    return;
  }

  const reachedPreviousEntry = !(status.progress > previousLatest);
  // The paging flag is only consulted when the previous entry point is not reached yet.
  const reachedHistoryEnd = !reachedPreviousEntry && currentChunk.paging.is_end;

  if (!reachedPreviousEntry && !reachedHistoryEnd) {
    processPage(new ChatEntry(currentChunk.paging.next));
    return;
  }

  status.isEnd = true;
  const notice = reachedPreviousEntry ? 'Previous entry point reached.' : 'History end reached.';
  fs.writeFile(STATUS_PATH, JSON.stringify(status), () => {
    console.log(notice);
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
x-zse-96-2.0-encrypt.js
: