Skip to content

Instantly share code, notes, and snippets.

@pnutmath
Last active July 27, 2023 10:18
Show Gist options
  • Save pnutmath/0b52dd13c67b6dbce2cdf1146447ed87 to your computer and use it in GitHub Desktop.
Scrape documentation and post to QnAs discourse
import fetch from 'node-fetch';
import cheerio from 'cheerio';
import { Configuration, OpenAIApi } from 'openai';
// --- Configuration -----------------------------------------------------------
// Fill in the placeholder values below before running the script.
const OPENAI_API_KEY = '<YOUR_OPENAI_API_KEY>';
// Base URL of the Discourse instance; must end with '/' because request
// paths (e.g. `posts.json`) are appended directly to it.
const DISCOURSE_SERVER_URL = '<YOUR_DISCOURSE_SERVER_URL>';
const DISCOURSE_API_KEY = '<YOUR_DISCOURSE_API_KEY>';
// Usernames that author the generated questions and answers, respectively.
const DISCOURSE_POST_CREATORS = ['<CREATOR1>', '<CREATOR2>'];
const DISCOURSE_PRODUCT_MODERATORS = ['<MODERATOR1>', '<MODERATOR2>'];
// Discourse category the generated topics are created in.
const CATEGORY_ID = 1;
// Root of the documentation site to crawl.
const ROOT_DOCUMENTATION_PATH = 'https://abhay.dev/';
const configuration = new Configuration({ apiKey: OPENAI_API_KEY });
const openai = new OpenAIApi(configuration);
// Fetch a page and extract its title plus the visible body text.
const parseContent = async (url) => {
  // Download the raw HTML.
  const response = await fetch(url);
  const html = await response.text();
  const $ = cheerio.load(html);
  // Strip elements that carry no documentation content.
  $('script, noscript, style, footer, nav, iframe').remove();
  const pageTitle = $('title').text();
  const bodyContent = $('body').text().trim();
  return { pageTitle, bodyContent, url };
};
// Crawl the documentation site starting at rootURL (depth-first) and return
// one { pageTitle, bodyContent, url } record per page successfully fetched.
const crawlAndParse = async (rootURL) => {
  // Normalize the root so the same-site prefix check works whether or not
  // rootURL ends with '/'. (The previous `${rootURL}/` comparison produced a
  // double slash when rootURL already had a trailing slash — as the configured
  // ROOT_DOCUMENTATION_PATH does — so no links were ever enqueued.)
  const rootPrefix = rootURL.endsWith('/') ? rootURL : `${rootURL}/`;
  const queue = [rootURL];
  const seen = new Set(queue);
  const crawledPages = [];
  while (queue.length) {
    const currentUrl = queue.pop();
    try {
      // Fetch the page ONCE and reuse the parsed DOM for both content
      // extraction and link discovery (previously each URL was fetched twice:
      // once via parseContent and once here for its hyperlinks).
      const response = await fetch(currentUrl);
      const html = await response.text();
      const $ = cheerio.load(html);
      // Collect hyperlinks before pruning: nav/footer also contain links.
      const hyperlinks = $('a[href]').map((_, el) => $(el).attr('href')).get();
      // Remove irrelevant elements, then capture the readable content.
      $('script, noscript, style, footer, nav, iframe').remove();
      crawledPages.push({
        pageTitle: $('title').text(),
        bodyContent: $('body').text().trim(),
        url: currentUrl,
      });
      // Resolve each link, keep only same-site ones, and enqueue unseen URLs.
      for (const href of hyperlinks) {
        let absolute;
        try {
          absolute = new URL(href, currentUrl).toString();
        } catch {
          continue; // skip hrefs that are not valid URLs
        }
        if (!absolute.startsWith(rootPrefix)) continue;
        const link = absolute.split('#')[0]; // drop fragment identifiers
        if (!seen.has(link)) {
          seen.add(link);
          queue.push(link);
        }
      }
    } catch (err) {
      console.error(`Failed to process ${currentUrl}: ${err}`);
    }
  }
  return crawledPages;
};
// Ask OpenAI to turn one crawled page into discussion material.
// Returns an array of { t, q, a } objects (topic title, question, answer in
// Markdown). Throws if the API call fails or the model response is not valid
// JSON — previously the `.catch(console.error)` swallowed API errors and the
// code then crashed with an unrelated TypeError on `completion.data`.
const generateQA = async ({ pageTitle, bodyContent, url }) => {
  console.info(`Generating Q&A for ${url}`);
  console.time(`generateQA:${url}`);
  let completion;
  try {
    completion = await openai.createChatCompletion({
      model: 'gpt-4',
      messages: [
        {
          role: 'system',
          content: `You are an AI trained to assist moderators of Discourse servers in generating initial posts from existing documentation. You're given the title and content of a webpage, along with its URL. Your task is to analyze the content and formulate a multiple relevant discussion topic (t), craft a detailed question to instigate discussion (q), and prepare an answer(a). The answer should be in Markdown format and include the reference document URL. Double escape the newline characters in answer. The response should be a JSON array with keys 't', 'q', and 'a'.`
        },
        {
          role: 'user',
          content: `Page Title: ${pageTitle}\n\nContent:\n${bodyContent}\n\nURL: ${url}`
        }
      ],
    });
  } finally {
    console.timeEnd(`generateQA:${url}`);
  }
  // Parse the Q&As out of the model's JSON response.
  const raw = completion.data.choices[0].message.content;
  let qas;
  try {
    qas = JSON.parse(raw);
  } catch (err) {
    throw new Error(`Model returned non-JSON content for ${url}`, { cause: err });
  }
  // The prompt asks for double-escaped newlines in the answer; unescape them.
  return qas.map((qa) => ({ ...qa, a: qa.a.replace(/\\n/g, '\n') }));
};
// Pick one entry uniformly at random from a non-empty array of usernames.
const getRandomUser = (users) => {
  const index = Math.floor(Math.random() * users.length);
  return users[index];
};
// Return a Date drawn uniformly at random from the interval [start, end].
function randomDate(start, end) {
  const span = end.getTime() - start.getTime();
  return new Date(start.getTime() + Math.random() * span);
}
// Timestamp window used to back-date generated posts: the past week.
const now = new Date();
const oneWeekAgo = new Date();
oneWeekAgo.setDate(now.getDate() - 7);
// NOTE(review): computed but never referenced below — postToDiscourse calls
// randomDate(oneWeekAgo, now) itself; candidate for removal.
const randomDateTime = randomDate(oneWeekAgo, now);
// Post one generated Q&A to Discourse: the question becomes a new topic
// (authored by a random creator) and the answer a reply on that topic
// (authored by a random moderator). Failures are logged and the Q&A is
// skipped rather than aborting the whole run — previously a network error was
// logged by `.catch(...)` and the code then crashed calling `.json()` on
// `undefined`.
const postToDiscourse = async (qa, category) => {
  // Attribute the question and the answer to different random users.
  const creator = getRandomUser(DISCOURSE_POST_CREATORS);
  const moderator = getRandomUser(DISCOURSE_PRODUCT_MODERATORS);
  // Step 1: create a new topic holding the question.
  let questionData;
  try {
    const questionResponse = await fetch(`${DISCOURSE_SERVER_URL}posts.json`, {
      method: 'POST',
      headers: {
        'Api-Key': DISCOURSE_API_KEY,
        'Api-Username': creator,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        title: qa.t,
        raw: qa.q,
        category,
        // Back-date the post to a random moment within the last week.
        created_at: randomDate(oneWeekAgo, now).toISOString(),
        // embed_url: '' // add documentation url if you want to embed it
      })
    });
    questionData = await questionResponse.json();
  } catch (err) {
    console.error(err);
    console.log(`(${qa.t} [Creator: ${creator}]) Failed to post question.`);
    return;
  }
  if (questionData.errors) {
    console.error(JSON.stringify(questionData));
    console.log(`(${qa.t} [Creator: ${creator}]) Failed to post question.`);
    return;
  }
  console.info(`Question posted (${qa.t}): ${DISCOURSE_SERVER_URL}t/${questionData.topic_id}`);
  // Step 2: post the answer as a reply to the new topic with a different user.
  let answerData;
  try {
    const answerResponse = await fetch(`${DISCOURSE_SERVER_URL}posts.json`, {
      method: 'POST',
      headers: {
        'Api-Key': DISCOURSE_API_KEY,
        'Api-Username': moderator,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        raw: qa.a,
        topic_id: questionData.topic_id, // topic_id of the question you've just created
      })
    });
    answerData = await answerResponse.json();
  } catch (err) {
    console.error(err);
    console.log(`(${qa.t} [Moderator: ${moderator}]) Failed to post answer.`);
    return;
  }
  if (answerData.errors) {
    console.error(JSON.stringify(answerData));
    console.log(`(${qa.t} [Moderator: ${moderator}]) Failed to post answer.`);
    return;
  }
  console.info(`Answer posted (${qa.t}): ${DISCOURSE_SERVER_URL}t/${answerData.topic_id}`);
};
// Main entry point: crawl the docs site, generate Q&As, and post them.
const main = async () => {
  // Step 1: Crawl all pages and parse the content.
  const pagesContent = await crawlAndParse(ROOT_DOCUMENTATION_PATH);
  console.info(`Pages crawled and found ${pagesContent.length} pages`);
  for (const content of pagesContent) {
    console.info(`Page: ${content.url}`);
  }
  // Step 2: Generate Q&A for each page, sequentially (one API call at a time).
  const allQAs = [];
  for (const content of pagesContent) {
    const qas = await generateQA(content);
    console.info(`Generated ${qas.length} Q&As for ${content.url}`);
    console.log(JSON.stringify(qas, null, 2));
    allQAs.push(...qas);
  }
  // Step 3: Post all Q&As to Discourse.
  for (const qa of allQAs) {
    await postToDiscourse(qa, CATEGORY_ID);
  }
};
// Surface any unhandled failure and fail the process instead of leaving a
// floating promise with no rejection handler.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment