// openai web-qa node.js (by @ceifa, April 12, 2023)
import axios from 'axios';
import { load } from 'cheerio';
import fs from 'node:fs';
import { setTimeout } from 'node:timers/promises';
import { createObjectCsvWriter } from 'csv-writer';
import tiktoken from 'tiktoken-node';
import { OpenAIApi, Configuration } from 'openai';
import distance from 'compute-cosine-distance';

// Hostname to crawl (e.g. 'example.com') and the full starting URL (e.g. 'https://example.com/').
const domain = '';
const full_url = '';

const openai = new OpenAIApi(new Configuration({
    apiKey: '' // OpenAI API key
}));
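
// Pipeline: crawl every same-domain page starting from full_url, save the visible
// text of each page under text/<domain>/, split long texts into ~500-token chunks,
// embed each chunk with text-embedding-ada-002, then answer questions read from
// stdin by putting the closest chunks into a text-davinci-003 completion prompt.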
// Fetch a page and return every href it contains; returns [] on any failure.
const getHyperlinks = async (url) => {
    try {
        const response = await axios.get(url);
        if (response.status !== 200 || response.headers['content-type'].indexOf('text/html') === -1) {
            return [];
        }

        const $ = load(response.data);
        const hyperlinks = [];
        $('a[href]').each((_, element) => {
            hyperlinks.push($(element).attr('href'));
        });
        return hyperlinks;
    } catch (error) {
        return [];
    }
};
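
// Normalize a page's hyperlinks: keep absolute links only when they point at
// localDomain, resolve relative links against the domain root, drop anchors,
// mailto: links, query strings and trailing slashes, and de-duplicate the result.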
const getDomainHyperlinks = async (localDomain, url) => {
    const hyperlinks = await getHyperlinks(url);
    const cleanLinks = [];
    hyperlinks.forEach((link) => {
        let cleanLink = null;
        if (/^http[s]*:\/\/.+/.test(link)) {
            // Absolute URL: keep it only if it is on the same domain.
            const urlObj = new URL(link);
            if (urlObj.hostname === localDomain) {
                cleanLink = `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`;
            }
        } else {
            // Relative URL: skip anchors, mailto: links and bare www. links,
            // then resolve the rest against the domain root.
            if (link.startsWith('/')) {
                link = link.slice(1);
            } else if (link.startsWith('#') || link.startsWith('mailto:') || link.startsWith('www.')) {
                return;
            }
            cleanLink = `https://${localDomain}/${link.split(/[?#]/)[0]}`;
        }

        if (cleanLink) {
            if (cleanLink.endsWith('/')) {
                cleanLink = cleanLink.slice(0, -1);
            }
            cleanLinks.push(cleanLink);
        }
    });

    return Array.from(new Set(cleanLinks));
};
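
// Breadth-first crawl starting at `url`: visit each same-domain page once, strip
// scripts and styles with cheerio, and write the remaining body text to
// text/<domain>/<flattened-url>.txt.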
const crawl = async (url) => {
    const localDomain = new URL(url).hostname;
    const queue = [url];
    const seen = new Set([url]);

    if (!fs.existsSync('text/')) {
        fs.mkdirSync('text/');
    }
    if (!fs.existsSync(`text/${localDomain}/`)) {
        fs.mkdirSync(`text/${localDomain}/`);
    }

    while (queue.length > 0) {
        const currentUrl = queue.shift();
        console.log(currentUrl);

        // Derive a flat file name from the URL, e.g. https://example.com/a/b -> example.com_a_b.txt
        const fileName = currentUrl.replace(/https?:\/\//, '').replace(/:\d+/g, '').replace(/\//g, '_');
        const filePath = `text/${localDomain}/${fileName}.txt`;

        await setTimeout(100); // small pause between requests

        try {
            const response = await axios.get(currentUrl);
            if (response.status !== 200 || response.headers['content-type'].indexOf('text/html') === -1) {
                continue;
            }

            const $ = load(response.data);
            $('script, style, link, noscript').remove();
            const text = $('body').text().replace(/\n+\s*/g, '\n').replace(/ +/g, ' ').trim();
            fs.writeFileSync(filePath, text);

            const hyperlinks = await getDomainHyperlinks(localDomain, currentUrl);
            hyperlinks.forEach((link) => {
                if (!seen.has(link)) {
                    queue.push(link);
                    seen.add(link);
                }
            });
        } catch (error) {
            console.log('Failed to process', currentUrl);
        }
    }
};
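
// Crawl the site, then load every saved page, collapse its whitespace, count its
// cl100k_base tokens, and write (fname, text) rows to processed/scraped.csv.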
console.log('Crawling...');
await crawl(full_url);
console.log('Done crawling');

console.log('Processing...');
const texts = [];
const tokenizer = tiktoken.getEncoding('cl100k_base');
const files = await fs.promises.readdir('text/' + domain + '/');
for (const file of files) {
    const text = await fs.promises.readFile('text/' + domain + '/' + file, 'utf8');
    const fixedText = text.replace(/\n/g, ' ').replace(/\\n/g, ' ').replace(/\s+/g, ' ');
    // Strip the leading domain prefix and the trailing '.txt' from the file name.
    const filename = file.slice(domain.length, -4);
    if (filename && fixedText) {
        texts.push({
            fname: filename,
            text: fixedText,
            n_tokens: tokenizer.encode(fixedText).length,
        });
    }
}

if (!fs.existsSync('processed/')) {
    fs.mkdirSync('processed/');
}

const csvWriter = createObjectCsvWriter({
    path: 'processed/scraped.csv',
    header: [
        { id: 'fname', title: 'fname' },
        { id: 'text', title: 'text' },
    ]
});
await csvWriter.writeRecords(texts);
console.log('Done processing');
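
// Texts longer than maxTokens are split on sentence boundaries ('. ') into chunks
// of at most ~500 tokens each, so every embedding request stays well within the
// model's input limit. Sentences that alone exceed maxTokens are dropped.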
const maxTokens = 500;

const splitIntoMany = (text) => {
    const sentences = text.split('. ');
    const nTokens = sentences.map((sentence) => tokenizer.encode(' ' + sentence).length);

    const chunks = [];
    let tokensSoFar = 0;
    let chunk = [];
    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const token = nTokens[i];

        // If adding this sentence would exceed the budget, flush the current chunk.
        if (tokensSoFar + token > maxTokens) {
            chunks.push(chunk.join('. ') + '.');
            chunk = [];
            tokensSoFar = 0;
        }

        // Skip sentences that are longer than the budget on their own.
        if (token > maxTokens) {
            continue;
        }

        chunk.push(sentence);
        tokensSoFar += token + 1;
    }

    if (chunk.length > 0) {
        chunks.push(chunk.join('. ') + '.');
    }

    return chunks;
};
const shortened = [];
for (const text of texts) {
    if (text.n_tokens > maxTokens) {
        shortened.push(...splitIntoMany(text.text));
    } else {
        shortened.push(text.text);
    }
}
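
// Embed every chunk with text-embedding-ada-002. Note that Promise.all sends all
// embedding requests concurrently; on larger sites this can run into OpenAI rate
// limits, so a sequential loop or a small concurrency cap may be safer.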
console.log('Embedding...');
const df = await Promise.all(shortened
    .filter(t => t !== '.')
    .map(async (text) => ({
        text,
        n_tokens: tokenizer.encode(text).length,
        embeddings: await openai.createEmbedding({
            input: text,
            model: 'text-embedding-ada-002',
        }).then((res) => res.data.data[0].embedding)
    })));

const csvWriter2 = createObjectCsvWriter({
    path: 'processed/embeddings.csv',
    header: [
        { id: 'text', title: 'text' },
        { id: 'n_tokens', title: 'n_tokens' },
        { id: 'embeddings', title: 'embeddings' },
    ]
});
await csvWriter2.writeRecords(df.map((row) => ({
    text: row.text,
    n_tokens: row.n_tokens,
    embeddings: JSON.stringify(row.embeddings),
})));
console.log('Done embedding');
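
// Build the context for a question: embed the question, sort all chunks by cosine
// distance to the question embedding (smaller = more similar), and concatenate the
// closest chunks, separated by "###", until roughly max_len tokens are used.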
const create_context = async (question, max_len = 1800) => {
    const q_embeddings = await openai.createEmbedding({
        input: question,
        model: 'text-embedding-ada-002',
    });
    const embeddings = q_embeddings.data.data[0].embedding;

    // Rank every chunk by cosine distance to the question embedding (closest first).
    const sortedDf = df.map((row) => ({
        ...row,
        distance: distance(embeddings, row.embeddings)
    })).sort((a, b) => a.distance - b.distance);

    const returns = [];
    let cur_len = 0;
    for (let i = 0; i < sortedDf.length; i++) {
        const row = sortedDf[i];
        cur_len += row.n_tokens + 4;
        if (cur_len > max_len) {
            break;
        }
        returns.push(row.text);
    }

    return returns.join('\n\n###\n\n');
};
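
// Answer a question with text-davinci-003, using the retrieved context and a prompt
// that tells the model to say "I don't know" when the context is not enough.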
const answer_question = async (question, max_len = 1800) => {
    const context = await create_context(question, max_len);
    console.log('Context:\n' + context + '\n\n');
    console.log('Question:\n' + question + '\n\n');
    console.log('Answering...');
    try {
        const response = await openai.createCompletion({
            prompt: `Answer the question based on the context below, and if the question cannot be answered based on the context, say "I don't know"\n\nContext: ${context}\n\n---\n\nQuestion: ${question}\nAnswer:`,
            temperature: 0,
            max_tokens: 150,
            top_p: 1,
            frequency_penalty: 0,
            presence_penalty: 0,
            stop: null,
            model: 'text-davinci-003',
        });
        return response.data.choices[0].text.trim();
    } catch (e) {
        console.log(e);
        return '';
    }
};
console.log('Answering...');

// Read questions from stdin, one per line, and print the model's answer.
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', async function (chunk) {
    const question = chunk.trim();
    if (question) {
        console.log(await answer_question(question));
    }
});
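
// To run this (an assumption, since the gist ships no package.json): save it as an
// ES module (e.g. index.mjs, so top-level await works), fill in domain, full_url and
// apiKey, install the dependencies, and pipe questions in on stdin:
//
//   npm install axios cheerio csv-writer tiktoken-node openai@3 compute-cosine-distance
//   node index.mjs
//
// openai is pinned to the 3.x SDK because the script uses its OpenAIApi/Configuration API.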