Skip to content

Instantly share code, notes, and snippets.

@jazzyjackson
Created April 24, 2023 05:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jazzyjackson/66ff45fb32224b60c82ecee558cf6d15 to your computer and use it in GitHub Desktop.
Save jazzyjackson/66ff45fb32224b60c82ecee558cf6d15 to your computer and use it in GitHub Desktop.
scrape chatorg.ai
// MAIN FUNCTION
// Scrapes the conversations listed on the chatorg page and downloads them as
// `conversationHistory.json`.
//
// Steps:
//   1. grab all conversation anchors (via clickEmptyChakraLinks)
//   2. click each anchor in turn and give the page time to render
//   3. read the rendered `#chatlog` and collect one {name, html} per message
/**
 * @param {number} [min=0] Index of the first anchor to scrape; earlier anchors
 *   are skipped and do not appear in the output.
 * @param {number} [loadDelayMs=3500] Milliseconds to wait after clicking an
 *   anchor before reading `#chatlog`.
 * @returns {Promise<Array<{title: string, messages: Array<{name: string, html: string}>}>>}
 *   The scraped conversations, in anchor order (the same data that is downloaded).
 */
async function main(min = 0, loadDelayMs = 3500) {
  const anchors = await clickEmptyChakraLinks();
  // Declared locally: assigning to an undeclared name throws in strict mode / ES
  // modules and leaks a global otherwise.
  const conversations = [];

  for (let i = min; i < anchors.length; i++) {
    const anchor = anchors[i];
    anchor.click();
    const title = anchor.innerText;

    // Fixed delay rather than an event: give the clicked conversation time to render.
    await wait(loadDelayMs);

    const conversation = { title, messages: [] };
    const container = document.querySelector('#chatlog');

    if (container) {
      // The last child is dropped on purpose: per the original author's note,
      // the last message "isn't real".
      const rows = Array.from(container.children).slice(0, -1);
      for (let j = 0; j < rows.length; j++) {
        const child = rows[j];
        try {
          console.log("querying ", j);
          // Sender name comes from the avatar's aria-label, content from the message wrapper.
          const name = child.querySelector('.chakra-avatar div').getAttribute('aria-label');
          const html = child.querySelector('.chat-message-wrapper').innerHTML;
          conversation.messages.push({ name, html });
        } catch (e) {
          // Best effort: a row without the expected markup is logged and skipped.
          console.log("Failed to get name or html", j);
          console.warn(e);
        }
      }
    } else {
      // Keep crawling instead of throwing and losing everything scraped so far.
      console.warn("No #chatlog found for conversation", title);
    }

    // push (not conversations[i] = ...) so a non-zero `min` does not leave holes
    // that JSON.stringify would serialise as leading nulls.
    conversations.push(conversation);
    console.log("PUSHED CONVERSATION", title);
  }

  // download the conversation history
  downloadStringAsFile(JSON.stringify(conversations), 'conversationHistory.json');
  return conversations;
}
// Delay helper used to pace the crawl.
// Returns a promise that fulfils (with no value) once `ms` milliseconds have elapsed.
function wait(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
// Function definitions BEGIN
/**
 * Repeatedly clicks every `a.chakra-link` whose `href` is empty until none are
 * left, then returns the conversation links.
 *
 * Each round clicks ALL currently-empty links (not just the first) and then
 * pauses 100 ms before looking again.
 *
 * Written as an async loop rather than a recursive `new Promise`, so an
 * exception thrown in any round rejects the returned promise instead of
 * leaving it pending forever.
 *
 * NOTE(review): there is no upper bound on the number of rounds; if the page
 * always contains an `a.chakra-link` with an empty href this never finishes.
 *
 * @returns {Promise<NodeList>} Result of
 *   `document.querySelectorAll('a.chakra-link[href^="/chat"]')`.
 */
async function clickEmptyChakraLinks() {
  const RETRY_DELAY_MS = 100;

  for (;;) {
    // grab all the chakra links, keep only anchors with no href
    const emptyChakraLinks = Array.from(document.querySelectorAll('a.chakra-link'))
      .filter((a) => !a.href);

    // nothing left to click: hand back the conversation links
    if (emptyChakraLinks.length === 0) {
      return document.querySelectorAll('a.chakra-link[href^="/chat"]');
    }

    for (const a of emptyChakraLinks) {
      a.click();
    }

    // give the page a moment to react before scanning again
    await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS));
  }
}
// Clicking a conversation link takes a moment to load the next conversation,
// so the crawl needs to know when the new chat history is in place before
// moving on.
//
// Per the original author's observation, when a history link is clicked the
// `#chatlog` element is first removed and later re-appended. This function
// watches `container` with a MutationObserver and resolves with the new
// `#chatlog` once that has happened.
/**
 * Clicks `element` and resolves once a fresh `#chatlog` is present in `container`.
 *
 * The observer is attached BEFORE the click: a MutationObserver only reports
 * changes made after `observe()` is called, so DOM changes performed
 * synchronously by the click handler would otherwise go unseen and the promise
 * could stay pending forever.
 *
 * The promise resolves when `#chatlog` exists and either
 *   - it was previously seen to be missing, or
 *   - it is a different node than the one present before the click (covers a
 *     remove + re-append delivered to the observer in a single batch).
 *
 * NOTE(review): there is no timeout; if `#chatlog` is never replaced the
 * promise never settles.
 *
 * @param {HTMLElement} element Link to click; its `innerText` is logged.
 * @param {Element} container Ancestor to observe and to query for `#chatlog`.
 * @returns {Promise<Element>} The `#chatlog` element found after the reload.
 */
function loadHistory(element, container) {
  console.log("LOADING HISTORY", element.innerText);

  // Remember the pre-click node so a swapped-in replacement can be recognised.
  const previousChatlog = container.querySelector('#chatlog');
  let chatlogRemoved = false;

  // The executor runs synchronously, so the observer is live before the click below.
  const loaded = new Promise((resolve) => {
    const observer = new MutationObserver(() => {
      const chatlog = container.querySelector('#chatlog');

      if (!chatlog) {
        // the chat history has been removed; remember that for the next mutation
        chatlogRemoved = true;
        console.log('!!! chatlog removed');
        return;
      }

      if (chatlogRemoved || chatlog !== previousChatlog) {
        // stop observing and hand back the freshly attached chatlog
        observer.disconnect();
        console.log('!!! resolving');
        resolve(chatlog);
      }
    });

    observer.observe(container, { childList: true, subtree: true });
  });

  element.click();
  return loaded;
}
/**
 * Triggers a browser download of `str` as a text file.
 *
 * Builds a `text/plain` Blob, points a temporary hidden `<a download>` at its
 * object URL and clicks it. The anchor is removed afterwards and the object
 * URL is revoked so the Blob is not kept alive for the lifetime of the page.
 *
 * @param {string} str File contents.
 * @param {string} [filename='download.txt'] Name suggested for the saved file.
 * @returns {void}
 */
function downloadStringAsFile(str, filename = 'download.txt') {
  // Create a Blob object from the input string
  const blob = new Blob([str], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);

  // A single hidden anchor is enough; wrapping it in a button made the
  // anchor's click bubble back into the button's own click listener.
  const link = document.createElement('a');
  link.href = url;
  link.download = filename;
  link.style.display = 'none';
  document.body.appendChild(link);

  try {
    link.click();
  } finally {
    link.remove();
    // Revoke on a later task so the download started by click() has picked up the URL.
    setTimeout(() => URL.revokeObjectURL(url), 0);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment