// openai web-qa node.js (by @ceifa, April 12, 2023)
import axios from 'axios';
import { load } from 'cheerio';
import fs from 'node:fs';
import { setTimeout } from 'node:timers/promises';
import { createObjectCsvWriter } from 'csv-writer';
import tiktoken from 'tiktoken-node';
import { OpenAIApi, Configuration } from 'openai';
import distance from 'compute-cosine-distance';

// Hostname to crawl (e.g. 'example.com') and the full starting URL (e.g. 'https://example.com/').
const domain = '';
const full_url = '';

const openai = new OpenAIApi(new Configuration({
    apiKey: '' // OpenAI API key
}));
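
// Pipeline: crawl every same-domain page starting from full_url, save the visible
// text of each page under text/<domain>/, split long texts into ~500-token chunks,
// embed each chunk with text-embedding-ada-002, then answer questions read from
// stdin by putting the closest chunks into a text-davinci-003 completion prompt.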
// Fetch a page and return every href it contains; returns [] on any failure.
const getHyperlinks = async (url) => {
    try {
        const response = await axios.get(url);
        if (response.status !== 200 || response.headers['content-type'].indexOf('text/html') === -1) {
            return [];
        }

        const $ = load(response.data);
        const hyperlinks = [];
        $('a[href]').each((_, element) => {
            hyperlinks.push($(element).attr('href'));
        });
        return hyperlinks;
    } catch (error) {
        return [];
    }
};
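
// Normalize a page's hyperlinks: keep absolute links only when they point at
// localDomain, resolve relative links against the domain root, drop anchors,
// mailto: links, query strings and trailing slashes, and de-duplicate the result.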
const getDomainHyperlinks = async (localDomain, url) => {
    const hyperlinks = await getHyperlinks(url);
    const cleanLinks = [];
    hyperlinks.forEach((link) => {
        let cleanLink = null;
        if (/^http[s]*:\/\/.+/.test(link)) {
            // Absolute URL: keep it only if it is on the same domain.
            const urlObj = new URL(link);
            if (urlObj.hostname === localDomain) {
                cleanLink = `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`;
            }
        } else {
            // Relative URL: skip anchors, mailto: links and bare www. links,
            // then resolve the rest against the domain root.
            if (link.startsWith('/')) {
                link = link.slice(1);
            } else if (link.startsWith('#') || link.startsWith('mailto:') || link.startsWith('www.')) {
                return;
            }
            cleanLink = `https://${localDomain}/${link.split(/[?#]/)[0]}`;
        }

        if (cleanLink) {
            if (cleanLink.endsWith('/')) {
                cleanLink = cleanLink.slice(0, -1);
            }
            cleanLinks.push(cleanLink);
        }
    });

    return Array.from(new Set(cleanLinks));
};
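
// Breadth-first crawl starting at `url`: visit each same-domain page once, strip
// scripts and styles with cheerio, and write the remaining body text to
// text/<domain>/<flattened-url>.txt.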
const crawl = async (url) => {
    const localDomain = new URL(url).hostname;
    const queue = [url];
    const seen = new Set([url]);

    if (!fs.existsSync('text/')) {
        fs.mkdirSync('text/');
    }
    if (!fs.existsSync(`text/${localDomain}/`)) {
        fs.mkdirSync(`text/${localDomain}/`);
    }

    while (queue.length > 0) {
        const currentUrl = queue.shift();
        console.log(currentUrl);

        // Derive a flat file name from the URL, e.g. https://example.com/a/b -> example.com_a_b.txt
        const fileName = currentUrl.replace(/https?:\/\//, '').replace(/:\d+/g, '').replace(/\//g, '_');
        const filePath = `text/${localDomain}/${fileName}.txt`;

        await setTimeout(100); // small pause between requests

        try {
            const response = await axios.get(currentUrl);
            if (response.status !== 200 || response.headers['content-type'].indexOf('text/html') === -1) {
                continue;
            }

            const $ = load(response.data);
            $('script, style, link, noscript').remove();
            const text = $('body').text().replace(/\n+\s*/g, '\n').replace(/ +/g, ' ').trim();
            fs.writeFileSync(filePath, text);

            const hyperlinks = await getDomainHyperlinks(localDomain, currentUrl);
            hyperlinks.forEach((link) => {
                if (!seen.has(link)) {
                    queue.push(link);
                    seen.add(link);
                }
            });
        } catch (error) {
            console.log('Failed to process', currentUrl);
        }
    }
};
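
// Crawl the site, then load every saved page, collapse its whitespace, count its
// cl100k_base tokens, and write (fname, text) rows to processed/scraped.csv.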
console.log('Crawling...');
await crawl(full_url);
console.log('Done crawling');

console.log('Processing...');
const texts = [];
const tokenizer = tiktoken.getEncoding('cl100k_base');
const files = await fs.promises.readdir('text/' + domain + '/');
for (const file of files) {
    const text = await fs.promises.readFile('text/' + domain + '/' + file, 'utf8');
    const fixedText = text.replace(/\n/g, ' ').replace(/\\n/g, ' ').replace(/\s+/g, ' ');
    // Strip the leading domain prefix and the trailing '.txt' from the file name.
    const filename = file.slice(domain.length, -4);
    if (filename && fixedText) {
        texts.push({
            fname: filename,
            text: fixedText,
            n_tokens: tokenizer.encode(fixedText).length,
        });
    }
}

if (!fs.existsSync('processed/')) {
    fs.mkdirSync('processed/');
}

const csvWriter = createObjectCsvWriter({
    path: 'processed/scraped.csv',
    header: [
        { id: 'fname', title: 'fname' },
        { id: 'text', title: 'text' },
    ]
});
await csvWriter.writeRecords(texts);
console.log('Done processing');
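
// Texts longer than maxTokens are split on sentence boundaries ('. ') into chunks
// of at most ~500 tokens each, so every embedding request stays well within the
// model's input limit. Sentences that alone exceed maxTokens are dropped.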
const maxTokens = 500;

const splitIntoMany = (text) => {
    const sentences = text.split('. ');
    const nTokens = sentences.map((sentence) => tokenizer.encode(' ' + sentence).length);

    const chunks = [];
    let tokensSoFar = 0;
    let chunk = [];
    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const token = nTokens[i];

        // If adding this sentence would exceed the budget, flush the current chunk.
        if (tokensSoFar + token > maxTokens) {
            chunks.push(chunk.join('. ') + '.');
            chunk = [];
            tokensSoFar = 0;
        }

        // Skip sentences that are longer than the budget on their own.
        if (token > maxTokens) {
            continue;
        }

        chunk.push(sentence);
        tokensSoFar += token + 1;
    }

    if (chunk.length > 0) {
        chunks.push(chunk.join('. ') + '.');
    }

    return chunks;
};
const shortened = [];
for (const text of texts) {
    if (text.n_tokens > maxTokens) {
        shortened.push(...splitIntoMany(text.text));
    } else {
        shortened.push(text.text);
    }
}
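
// Embed every chunk with text-embedding-ada-002. Note that Promise.all sends all
// embedding requests concurrently; on larger sites this can run into OpenAI rate
// limits, so a sequential loop or a small concurrency cap may be safer.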
console.log('Embedding...');
const df = await Promise.all(shortened
    .filter(t => t !== '.')
    .map(async (text) => ({
        text,
        n_tokens: tokenizer.encode(text).length,
        embeddings: await openai.createEmbedding({
            input: text,
            model: 'text-embedding-ada-002',
        }).then((res) => res.data.data[0].embedding)
    })));

const csvWriter2 = createObjectCsvWriter({
    path: 'processed/embeddings.csv',
    header: [
        { id: 'text', title: 'text' },
        { id: 'n_tokens', title: 'n_tokens' },
        { id: 'embeddings', title: 'embeddings' },
    ]
});
await csvWriter2.writeRecords(df.map((row) => ({
    text: row.text,
    n_tokens: row.n_tokens,
    embeddings: JSON.stringify(row.embeddings),
})));
console.log('Done embedding');
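
// Build the context for a question: embed the question, sort all chunks by cosine
// distance to the question embedding (smaller = more similar), and concatenate the
// closest chunks, separated by "###", until roughly max_len tokens are used.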
const create_context = async (question, max_len = 1800) => {
    const q_embeddings = await openai.createEmbedding({
        input: question,
        model: 'text-embedding-ada-002',
    });
    const embeddings = q_embeddings.data.data[0].embedding;

    // Rank every chunk by cosine distance to the question embedding (closest first).
    const sortedDf = df.map((row) => ({
        ...row,
        distance: distance(embeddings, row.embeddings)
    })).sort((a, b) => a.distance - b.distance);

    const returns = [];
    let cur_len = 0;
    for (let i = 0; i < sortedDf.length; i++) {
        const row = sortedDf[i];
        cur_len += row.n_tokens + 4;
        if (cur_len > max_len) {
            break;
        }
        returns.push(row.text);
    }

    return returns.join('\n\n###\n\n');
};
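
// Answer a question with text-davinci-003, using the retrieved context and a prompt
// that tells the model to say "I don't know" when the context is not enough.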
const answer_question = async (question, max_len = 1800) => {
    const context = await create_context(question, max_len);
    console.log('Context:\n' + context + '\n\n');
    console.log('Question:\n' + question + '\n\n');
    console.log('Answering...');
    try {
        const response = await openai.createCompletion({
            prompt: `Answer the question based on the context below, and if the question cannot be answered based on the context, say "I don't know"\n\nContext: ${context}\n\n---\n\nQuestion: ${question}\nAnswer:`,
            temperature: 0,
            max_tokens: 150,
            top_p: 1,
            frequency_penalty: 0,
            presence_penalty: 0,
            stop: null,
            model: 'text-davinci-003',
        });
        return response.data.choices[0].text.trim();
    } catch (e) {
        console.log(e);
        return '';
    }
};
console.log('Answering...');

// Read questions from stdin, one per line, and print the model's answer.
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', async function (chunk) {
    const question = chunk.trim();
    if (question) {
        console.log(await answer_question(question));
    }
});
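
// To run this (an assumption, since the gist ships no package.json): save it as an
// ES module (e.g. index.mjs, so top-level await works), fill in domain, full_url and
// apiKey, install the dependencies, and pipe questions in on stdin:
//
//   npm install axios cheerio csv-writer tiktoken-node openai@3 compute-cosine-distance
//   node index.mjs
//
// openai is pinned to the 3.x SDK because the script uses its OpenAIApi/Configuration API.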