Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 20 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save hrishioa/7654bd52a617465b0da40443a1a80a0c to your computer and use it in GitHub Desktop.
Save hrishioa/7654bd52a617465b0da40443a1a80a0c to your computer and use it in GitHub Desktop.
Simple TypeScript file demonstrating chunked, chained LLM calls to process large amounts of text.
// Requires the gpt library from https://github.com/hrishioa/socrate, the cli-progress and ansi-colors packages (progress bar), and langchain's Playwright web loader (scraping).
// Created by Hrishi Olickel (hrishioa@gmail.com) (@hrishioa). Reach out if you have trouble running this.
import { ThunkQueue } from '../../utils/simplethrottler';
import {
AcceptedModels,
Messages,
askChatGPT,
getMessagesTokenCount,
getProperJSONFromGPT,
modelProperties,
} from '../base';
const cliProgress = require('cli-progress');
const colors = require('ansi-colors');
import fs from 'fs';
import { Browser, Page, PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright';
/**
 * One license as stored in the JSON files under ./tmp_data: the scraped name
 * and text, plus the data this script derives from them.
 */
type License = {
// Text of the `spdx:name` element on the license's SPDX page (see getLicense).
licenseName: string;
// Text of the `spdx:licenseText` element on the license's SPDX page (see getLicense).
licenseContent: string;
// Paragraph split of licenseContent; filled in by processLicenseWithGPT when absent.
licenseContentParts?: string[];
// Result of the LLM pass. processLicenses only stores the parsed object form;
// the string alternative mirrors processLicenseWithGPT's raw-text fallback.
processedAnswers?: ProcessedAnswers | string;
};
/**
 * Shape of the JSON the model is asked to produce for a license.
 *
 * PROCESSED_ANSWERS_SPEC repeats this declaration as a string that is sent to
 * the model, so the per-field comments double as the questions being asked.
 * Keep the two declarations in sync when editing either one.
 */
type ProcessedAnswers = {
commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
requirePublish: boolean; // Does this license require that the source code be published?
};
// Source text of the ProcessedAnswers type, interpolated into the user prompt
// built in processLicenseWithGPT. Everything inside the template literal,
// comments included, is sent to the model verbatim, so edits here change the
// prompt. Keep it in sync with the ProcessedAnswers type.
const PROCESSED_ANSWERS_SPEC = `type ProcessedAnswers = {
commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
requirePublish: boolean; // Does this license require that the source code be published?
}`;
// Verbose logging switch: on only when the COPILOT_IS_DEBUG environment
// variable is exactly the string 'true'.
const DEBUG = process.env.COPILOT_IS_DEBUG === 'true';
/**
 * Scrapes the SPDX license index page and returns the license identifiers
 * found there.
 *
 * Every element matching `[typeof="spdx:License"]` contributes its text
 * content with all whitespace removed. Elements that yield no text are
 * skipped, so a page with no matches produces an empty array rather than a
 * single empty identifier.
 *
 * @returns The identifiers in document order.
 * @throws If the page cannot be loaded or the scraped payload is not an array.
 */
export async function getLicenseList(): Promise<string[]> {
  const loader = new PlaywrightWebBaseLoader('https://spdx.org/licenses/', {
    launchOptions: {
      headless: true,
    },
    gotoOptions: {
      waitUntil: 'domcontentloaded',
    },
    // The loader hands back a single string, so the identifiers are
    // serialized as JSON inside the page and parsed again below.
    async evaluate(page: Page, browser: Browser) {
      return await page.evaluate(() =>
        JSON.stringify(
          Array.from(
            document.querySelectorAll('[typeof="spdx:License"]'),
            (licenseTag) => licenseTag.textContent ?? ''
          )
        )
      );
    },
  });

  const scraped: unknown = JSON.parse(await loader.scrape());
  if (!Array.isArray(scraped)) {
    throw new Error('Unexpected payload while scraping the SPDX license list');
  }

  return scraped
    .filter((text): text is string => typeof text === 'string')
    .map((text) => text.replace(/\s/g, ''))
    .filter((licenseId) => licenseId.length > 0);
}
/**
 * Scrapes the SPDX page of a single license.
 *
 * @param licenseId SPDX identifier; interpolated as-is into
 *   `https://spdx.org/licenses/<licenseId>.html`.
 * @returns The text of the page's `spdx:name` and `spdx:licenseText` elements.
 * @throws If the page cannot be loaded, or either element is missing from it.
 */
export async function getLicense(
  licenseId: string
): Promise<{ licenseName: string; licenseContent: string }> {
  const loader = new PlaywrightWebBaseLoader(`https://spdx.org/licenses/${licenseId}.html`, {
    launchOptions: {
      headless: true,
    },
    gotoOptions: {
      waitUntil: 'domcontentloaded',
    },
    // The loader hands back a single string, so both fields travel as JSON.
    // Missing elements become null here and are rejected below with a message
    // that names the license, instead of failing inside the page context.
    async evaluate(page: Page, browser: Browser) {
      return await page.evaluate(() =>
        JSON.stringify({
          licenseName:
            document.querySelector('[property="spdx:name"]')?.textContent ?? null,
          licenseContent:
            document.querySelector('[property="spdx:licenseText"]')?.textContent ?? null,
        })
      );
    },
  });

  const scraped: unknown = JSON.parse(await loader.scrape());
  if (typeof scraped !== 'object' || scraped === null) {
    throw new Error(`Unexpected payload while scraping license "${licenseId}"`);
  }

  const { licenseName, licenseContent } = scraped as Record<string, unknown>;
  if (typeof licenseName !== 'string' || typeof licenseContent !== 'string') {
    throw new Error(
      `Could not find the name and text of license "${licenseId}" on its SPDX page`
    );
  }

  return { licenseName, licenseContent };
}
/**
 * Splits license text into paragraphs.
 *
 * Paragraph boundaries are blank lines (a line break, optional whitespace,
 * another line break). Within each paragraph every run of whitespace,
 * including single line breaks, is collapsed to one space, and leading and
 * trailing whitespace is removed. Paragraphs that end up empty are dropped.
 *
 * @param licenseText Raw license text.
 * @returns The non-empty, whitespace-normalized paragraphs in original order.
 */
function splitLicenseIntoParagraphs(licenseText: string): string[] {
  return licenseText
    .split(/\n\s*\n/)
    // The `g` flag matters: without it only the first whitespace run of each
    // paragraph would be collapsed.
    .map((paragraph) => paragraph.trim().replace(/\s+/g, ' '))
    .filter((paragraph) => paragraph.length > 0);
}
/**
 * Asks the chat model to fill in a ProcessedAnswers object for a license,
 * feeding it the license text in token-bounded chunks.
 *
 * Each call sends as many leading paragraphs as fit in the token budget,
 * together with the answers produced for the previous chunk. If paragraphs
 * remain, the function recurses on them with the new answers; otherwise the
 * reply is parsed as the final result.
 *
 * Side effect: when `license.licenseContentParts` is missing it is computed
 * from `license.licenseContent` and stored on the passed object.
 *
 * @param license License to process. Only the paragraphs listed in
 *   `licenseContentParts` (all of them on the first call) are sent.
 * @param previousAnswers Raw model output for the preceding chunk, if any.
 * @returns The parsed answers; the value recovered by getProperJSONFromGPT
 *   when direct parsing fails; the raw reply string when that recovery also
 *   fails; or `null` when the model call did not return a complete message.
 */
async function processLicenseWithGPT(
  license: License,
  previousAnswers?: string
): Promise<ProcessedAnswers | string | null> {
  const BASEMODEL: AcceptedModels = 'gpt-3.5-turbo';
  // Budget for license text per request: the model's token limit minus 1000
  // (presumably headroom for the prompts and the reply). Keep the model key
  // in sync with BASEMODEL.
  const LICENSE_CONTENT_TOKEN_LIMIT =
    modelProperties['gpt-3.5-turbo'].tokenLimit - 1000;

  if (!license.licenseContentParts) {
    license.licenseContentParts = splitLicenseIntoParagraphs(
      license.licenseContent
    );
  }
  const parts = license.licenseContentParts;

  // prettier-ignore
  const prompts = {
    systemPrompt: (licenseContent: string, licenseName: string) =>
`You are a commercial license processor that can only output valid JSON.
LICENSE_NAME: ${licenseName}
LICENSE_CONTENT_PART:
\`\`\`
${licenseContent}
\`\`\`
`,
    startingPrompt: (previousAnswers?: string) =>
`PROCESSED_ANSWERS_SPEC:
\`\`\`typescript
${PROCESSED_ANSWERS_SPEC}
\`\`\`
ANSWERS_FOR_PREVIOUS_PARTS:
${previousAnswers ? previousAnswers : 'None'}
LICENSE_CONTENT_PART contains part of a code license. ANSWERS_FOR_PREVIOUS_PARTS contains ProcessedAnswers about the previous parts of the license. Use ANSWERS_FOR_PREVIOUS_PARTS and LICENSE_CONTENT_PART to generate a new JSON in the spec of PROCESSED_ANSWERS_SPEC, answering the questions therein.
Processed Answers JSON:
{
`
  }

  // Greedily take leading paragraphs while they fit in the budget.
  const chunkParts: string[] = [];
  let chunkTokenCount = 0;
  for (const part of parts) {
    const partTokenCount = getMessagesTokenCount([
      {
        role: 'system',
        content: part,
      },
    ]);
    const fits = chunkTokenCount + partTokenCount < LICENSE_CONTENT_TOKEN_LIMIT;
    // Always take at least one paragraph, even an oversized one. Otherwise the
    // chunk would be empty, the same paragraphs would be passed to the next
    // recursive call unchanged, and the recursion (with a model call per step)
    // would never end.
    if (!fits && chunkParts.length > 0) break;
    chunkParts.push(part);
    chunkTokenCount += partTokenCount;
  }
  const remainingPartsToProcess = parts.slice(chunkParts.length);
  // Keep paragraphs separated so the last word of one does not fuse with the
  // first word of the next.
  const trimmedLicenseContent = chunkParts.join('\n\n');

  const messages: Messages = [
    {
      role: 'system',
      content: prompts.systemPrompt(trimmedLicenseContent, license.licenseName),
    },
    {
      role: 'user',
      content: prompts.startingPrompt(previousAnswers),
    },
  ];

  if (DEBUG)
    console.log(
      'Processing part starting with ',
      trimmedLicenseContent.slice(0, 100),
      '...'
    );

  const result = await askChatGPT(messages, BASEMODEL, undefined, undefined, 1);

  if (result.response.type !== 'completeMessage') {
    console.error(
      'Error processing ',
      license.licenseName,
      ' - ',
      result.response
    );
    return null;
  }

  // The user prompt ends with an opening brace, so that brace is put back in
  // front of the reply before it is treated as JSON.
  const completeMessage = '{' + result.response.completeMessage;
  if (DEBUG) console.log('Got {', completeMessage, '.');

  if (remainingPartsToProcess.length > 0) {
    // Intermediate chunk: hand the raw answers to the next chunk unparsed.
    return await processLicenseWithGPT(
      { ...license, licenseContentParts: remainingPartsToProcess },
      completeMessage
    );
  }

  try {
    const processedAnswers: ProcessedAnswers = JSON.parse(completeMessage);
    return processedAnswers;
  } catch (err) {
    // Not valid JSON as-is: ask the helper to recover it, and fall back to
    // returning the raw text so the caller can tell the attempt failed.
    const betterJSON = await getProperJSONFromGPT(completeMessage, 1);
    if (betterJSON.success) {
      if (DEBUG)
        console.log('JSON coercion got us ', betterJSON.extractedJSON, '.');
      return betterJSON.extractedJSON;
    }
    if (DEBUG)
      console.error('Error processing ', completeMessage, ' - ', err);
    return completeMessage;
  }
}
/**
 * Downloads license pages from SPDX and caches them in
 * ./tmp_data/licenses.json.
 *
 * Entries already present in the cache file are kept, and fetched licenses
 * are added or overwritten. The file is rewritten after every successful
 * download, so an interrupted run keeps what it has fetched so far. A failure
 * for one license does not stop the others; failures are counted on the
 * progress bar and listed once the run is over.
 */
async function loadLicenses() {
  // Only the first MAX_LICENSES entries of the list are fetched, to keep the
  // cost of the later LLM pass bounded. The value matches the original guard
  // (`i > 50`), which let indices 0 through 50 through.
  const MAX_LICENSES = 51;
  const DATA_DIR = './tmp_data';
  const LICENSES_PATH = './tmp_data/licenses.json';

  console.log('Getting license list...');
  const licenseList = (await getLicenseList()).slice(0, MAX_LICENSES);
  console.log('Downloading licenses...');

  // Without the directory every write below fails and would be reported as a
  // download error.
  fs.mkdirSync(DATA_DIR, { recursive: true });

  const pBar = new cliProgress.SingleBar({
    format:
      'Downloading Licenses |' +
      colors.cyan('{bar}') +
      '| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount} Current: {licenseId}',
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
  });
  let errorCount = 0;
  // The total is the capped count so the bar can actually reach 100%.
  pBar.start(licenseList.length, 0, {
    licenseId: licenseList[0] ?? 'None',
    errorCount: errorCount,
  });

  const licenses = fs.existsSync(LICENSES_PATH)
    ? JSON.parse(fs.readFileSync(LICENSES_PATH, 'utf8'))
    : {};
  const erroredLicenses: string[] = [];
  const licenseQueue = new ThunkQueue(50);

  try {
    for (const licenseId of licenseList) {
      licenseQueue.add(async () => {
        try {
          licenses[licenseId] = await getLicense(licenseId);
          fs.writeFileSync(LICENSES_PATH, JSON.stringify(licenses, null, 2));
        } catch (err) {
          const reason = err instanceof Error ? err.message : String(err);
          erroredLicenses.push(`${licenseId}: ${reason}`);
          errorCount++;
        }
        pBar.increment(1, {
          licenseId: licenseId,
          errorCount: errorCount,
        });
      });
    }
    await licenseQueue.waitForAll();
  } finally {
    // Stops rendering and gives the cursor back (the bar hides it).
    pBar.stop();
  }

  if (erroredLicenses.length > 0) {
    console.error('Failed to download:\n' + erroredLicenses.join('\n'));
  }
}
/**
 * Runs the LLM pass over every license cached in ./tmp_data/licenses.json and
 * stores the results in ./tmp_data/processedLicenses.json.
 *
 * Licenses already present in the output file are skipped, and the output file
 * is rewritten after every success, so the run can be resumed. A license
 * counts as an error when processing throws, returns `null`, or returns the
 * raw unparseable reply (a string). Such licenses are not written and will be
 * retried on the next run.
 */
async function processLicenses() {
  const DATA_DIR = './tmp_data';
  const LICENSES_PATH = './tmp_data/licenses.json';
  const PROCESSED_PATH = './tmp_data/processedLicenses.json';

  const licenseList: { [key: string]: License } = fs.existsSync(LICENSES_PATH)
    ? JSON.parse(fs.readFileSync(LICENSES_PATH, 'utf8'))
    : {};
  const processedLicenseList: { [key: string]: License } = fs.existsSync(
    PROCESSED_PATH
  )
    ? JSON.parse(fs.readFileSync(PROCESSED_PATH, 'utf8'))
    : {};

  // Make sure the output file can be written even on a fresh checkout.
  fs.mkdirSync(DATA_DIR, { recursive: true });

  const pBar = new cliProgress.SingleBar({
    format:
      'Processing Licenses |' +
      colors.cyan('{bar}') +
      '| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount}, Succeeded: {successCount} Current: {licenseId}',
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
  });
  let errorCount = 0,
    successCount = 0;
  const failures: string[] = [];
  pBar.start(Object.keys(licenseList).length, 0, {
    licenseId: 'None',
    errorCount: errorCount,
    successCount: successCount,
  });

  // NOTE(review): the argument is presumably the queue's concurrency limit,
  // i.e. one license at a time here; confirm against ThunkQueue.
  const licenseQueue = new ThunkQueue(1);

  try {
    for (const [licenseId, license] of Object.entries(licenseList)) {
      licenseQueue.add(async () => {
        if (!processedLicenseList[licenseId]) {
          try {
            const processedAnswers = await processLicenseWithGPT(license);
            if (processedAnswers === null || typeof processedAnswers === 'string') {
              errorCount++;
            } else {
              license.processedAnswers = processedAnswers;
              processedLicenseList[licenseId] = license;
              successCount++;
              fs.writeFileSync(
                PROCESSED_PATH,
                JSON.stringify(processedLicenseList, null, 2)
              );
            }
          } catch (err) {
            // A failed request or write for one license must not abort the
            // rest of the queue.
            const reason = err instanceof Error ? err.message : String(err);
            failures.push(`${licenseId}: ${reason}`);
            errorCount++;
          }
        }
        pBar.increment(1, {
          licenseId: licenseId,
          errorCount: errorCount,
          successCount: successCount,
        });
      });
    }
    await licenseQueue.waitForAll();
  } finally {
    // Stops rendering and gives the cursor back (the bar hides it).
    pBar.stop();
  }

  if (failures.length > 0) {
    console.error('Failed to process:\n' + failures.join('\n'));
  }
}
// Script entry point: download the licenses. The LLM pass is left disabled;
// uncomment it to process the downloaded licenses as well.
(async function loadAndProcessLicenses() {
  await loadLicenses();
  // await processLicenses();
})().catch((err: unknown) => {
  // Report the failure and signal it through the exit code instead of leaving
  // an unhandled promise rejection.
  console.error('License pipeline failed: ', err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment