// Simple TypeScript file demonstrating chunked, chained LLM calls to process large amounts of text.
// Requires the GPT helper library and the cli-progress progress bar library.
// Created by Hrishi Olickel (@hrishioa). Reach out if you have trouble running this.
import { ThunkQueue } from '../../utils/simplethrottler';
import {
} from '../base';
const cliProgress = require('cli-progress');
const colors = require('ansi-colors');
import fs from 'fs';
import { Browser, Page, PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright';
/**
 * A single software license as scraped by getLicense, optionally enriched
 * with intermediate and final processing results.
 */
type License = {
// Display name of the license (read from the spdx:name element in getLicense).
licenseName: string;
// Full license text (read from the spdx:licenseText element in getLicense).
licenseContent: string;
// The license text split into parts; filled in by processLicenseWithGPT when absent.
licenseContentParts?: string[];
// Result of GPT processing; processLicenses assigns it when a parsed object comes back.
processedAnswers?: ProcessedAnswers | string;
/**
 * Structured answers about a license that the LLM is asked to produce.
 * PROCESSED_ANSWERS_SPEC (declared right after this type) repeats the same
 * definition as a string; keep the two in sync when editing either one.
 */
type ProcessedAnswers = {
commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
requirePublish: boolean; // Does this license require that the source code be published?
// String copy of the ProcessedAnswers type, including its per-field question
// comments. The prompt text in processLicenseWithGPT refers to
// PROCESSED_ANSWERS_SPEC by name, so presumably this string is interpolated
// into the prompt (the interpolation itself is not visible here - TODO confirm).
const PROCESSED_ANSWERS_SPEC = `type ProcessedAnswers = {
commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
requirePublish: boolean; // Does this license require that the source code be published?
// Verbose logging switch: true only when the COPILOT_IS_DEBUG environment
// variable is exactly the string 'true'. Guards the debug console output in
// processLicenseWithGPT.
const DEBUG = process.env.COPILOT_IS_DEBUG === 'true';
/**
 * Scrapes the list of license identifiers from a web page using a headless
 * Playwright browser.
 *
 * Elements matching [typeof="spdx:License"] supply the text. The scraped
 * string is split on '~' and every piece has all whitespace removed.
 *
 * NOTE(review): the loader URL is an empty string and the selector line inside
 * page.evaluate is malformed as written. Both look like lost text (presumably
 * the license index URL and a querySelectorAll + map/join call). Confirm
 * against the original source before running.
 *
 * @returns the whitespace-stripped license identifier strings
 */
export async function getLicenseList() {
const loader = new PlaywrightWebBaseLoader('', {
launchOptions: {
headless: true,
gotoOptions: {
// Resolve navigation once the DOM is parsed.
waitUntil: 'domcontentloaded',
// Custom extractor: runs in the page and returns the text that scrape() yields.
async evaluate(page: Page, browser: Browser) {
return await page.evaluate(() => {
return []'[typeof="spdx:License"]'), function (licensetag) {
return licensetag.textContent
const licenseStrs = await loader.scrape();
// The scraped result is one string; '~' is the separator between entries.
const licenses = licenseStrs.split('~').map((licenseStr) => {
return licenseStr.replace(/[\s\n]/g, '')
return licenses;
/**
 * Downloads a single license page with a headless Playwright browser and
 * extracts the license name and full text.
 *
 * Inside the page, the elements with property spdx:name and
 * spdx:licenseText are read and serialized to a JSON string, which is parsed
 * back into an object once scraping finishes.
 *
 * NOTE(review): the URL is just the license id plus ".html" with no host or
 * base path. Presumably a base URL was lost; confirm before running.
 * NOTE(review): the non-null assertions mean a page lacking either element
 * makes the in-page callback throw on the textContent access.
 *
 * @param licenseId identifier used to build the page URL
 * @returns the JSON.parse result carrying licenseName and licenseContent (not validated)
 */
export async function getLicense(licenseId: string) {
const loader = new PlaywrightWebBaseLoader(`${licenseId}.html`, {
launchOptions: {
headless: true,
gotoOptions: {
waitUntil: 'domcontentloaded',
// Custom extractor: runs in the page and returns the string that scrape() yields.
async evaluate(page: Page, browser: Browser) {
return await page.evaluate(() => {
return JSON.stringify({
licenseName: document.querySelector('[property="spdx:name"]')!.textContent,
licenseContent: document.querySelector('[property="spdx:licenseText"]')!.textContent,
const licenseStr = await loader.scrape();
// scrape() hands back the JSON string produced in the page; turn it into an object.
const license = JSON.parse(licenseStr);
return license;
/**
 * Splits license text into paragraphs and normalizes the whitespace in each.
 *
 * Paragraph boundaries are blank lines (two line breaks, optionally separated
 * by whitespace). Within every paragraph, leading and trailing whitespace is
 * removed and each internal run of whitespace (including single newlines) is
 * collapsed to one space.
 *
 * @param licenseText raw license text
 * @returns one normalized string per paragraph, in document order
 */
function splitLicenseIntoParagraphs(licenseText: string): string[] {
  // Split the text into paragraphs based on multiple consecutive line breaks
  const paragraphs = licenseText.split(/\n\s*\n/);
  // Trim each paragraph and collapse whitespace. The g flag matters: without
  // it only the first whitespace run in a paragraph would be replaced.
  return paragraphs.map((paragraph) => paragraph.trim().replace(/\s+/g, ' '));
}
/**
 * Feeds a license to the chat model in chunks and returns the model's
 * structured answers about it.
 *
 * Flow, as far as the visible code shows:
 *  1. If the license has no licenseContentParts yet, they are computed with
 *     splitLicenseIntoParagraphs and stored on the passed-in license object.
 *  2. Parts are appended to one chunk while the running token count (measured
 *     with getMessagesTokenCount) stays under a limit; the first part that
 *     does not fit, and everything after it, becomes the remaining parts.
 *  3. A system message (built from the chunk and the license name) and a user
 *     message (built from previousAnswers) are sent via askChatGPT.
 *  4. For a 'completeMessage' response, '{' is prepended to the reply.
 *     Presumably the prompt ends with an opening brace that the model
 *     continues from - TODO confirm.
 *  5. If parts remain, the function calls itself with the remaining parts;
 *     otherwise the reply is parsed as JSON. When parsing throws,
 *     getProperJSONFromGPT is asked to repair it, and when that also fails the
 *     raw reply string is returned.
 *
 * NOTE(review): several lines of this function are not visible here (the name
 * the token limit is assigned to, the right-hand side of the size comparison,
 * any loop exit after the remaining parts are captured, the prompt bodies and
 * the arguments of the recursive call). Verify those against the full source.
 *
 * @param license the license to process; its licenseContentParts may be set as a side effect
 * @param previousAnswers answers produced for earlier chunks, if any ('None' is sent when absent)
 * @returns the parsed answers, the raw model reply as a string when it could
 *          not be turned into JSON, or null when the response was not a
 *          complete message
 */
async function processLicenseWithGPT(
license: License,
previousAnswers?: string
): Promise<ProcessedAnswers | string | null> {
// Model used for both token counting limits and the chat request.
const BASEMODEL: AcceptedModels = 'gpt-3.5-turbo';
// Model token limit minus 1000; presumably the budget for license text per
// request, leaving headroom for the prompt and the reply - TODO confirm.
modelProperties['gpt-3.5-turbo'].tokenLimit - 1000;
// Compute the parts lazily; recursive calls arrive with them already set.
if (!license.licenseContentParts)
license.licenseContentParts = splitLicenseIntoParagraphs(
// prettier-ignore
const prompts = {
systemPrompt: (licenseContent: string, licenseName: string) =>
`You are a commercial license processor that can only output valid JSON.
LICENSE_NAME: ${licenseName}
startingPrompt: (previousAnswers?: string) =>
${previousAnswers ? previousAnswers : 'None'}
LICENSE_CONTENT_PART contains part of a code license. ANSWERS_FOR_PREVIOUS_PARTS contains ProcessedAnswers about the previous parts of the license. Use ANSWERS_FOR_PREVIOUS_PARTS and LICENSE_CONTENT_PART to generate a new JSON in the spec of PROCESSED_ANSWERS_SPEC, answering the questions therein.
Processed Answers JSON:
let trimmedLicenseContent = '';
let trimmedLicenseTokenCount = 0;
let remainingPartsToProcess: string[] = [];
for (let i = 0; i < license.licenseContentParts.length; i++) {
const licensePartTokenCount = getMessagesTokenCount([
role: 'system',
content: license.licenseContentParts[i],
if (
trimmedLicenseTokenCount + licensePartTokenCount <
) {
trimmedLicenseContent += license.licenseContentParts[i];
trimmedLicenseTokenCount += licensePartTokenCount;
} else {
remainingPartsToProcess = license.licenseContentParts.slice(i);
const messages: Messages = [
role: 'system',
content: prompts.systemPrompt(trimmedLicenseContent, license.licenseName),
role: 'user',
content: prompts.startingPrompt(previousAnswers),
if (DEBUG)
'Processing part starting with ',
trimmedLicenseContent.slice(0, 100),
const result = await askChatGPT(messages, BASEMODEL, undefined, undefined, 1);
if (result.response.type === 'completeMessage') {
result.response.completeMessage = '{' + result.response.completeMessage;
if (DEBUG) console.log('Got {', result.response.completeMessage, '.');
if (remainingPartsToProcess.length > 0) {
return await processLicenseWithGPT(
{ ...license, licenseContentParts: remainingPartsToProcess },
} else {
try {
const processedAnswers: ProcessedAnswers = JSON.parse(
return processedAnswers;
} catch (err) {
const betterJSON = await getProperJSONFromGPT(
if (betterJSON.success) {
if (DEBUG)
console.log('JSON coercion got us ', betterJSON.extractedJSON, '.');
return betterJSON.extractedJSON;
} else {
if (DEBUG)
'Error processing ',
' - ',
return result.response.completeMessage;
} else {
'Error processing ',
' - ',
return null;
/**
 * Downloads licenses and collects them in an object keyed by license id.
 *
 * Fetches the id list with getLicenseList, seeds the result object from
 * ./tmp_data/licenses.json when that file exists, then queues one getLicense
 * download per id on a ThunkQueue while a cli-progress bar reports progress.
 *
 * NOTE(review): the statement that consumes the JSON.stringify output and the
 * body of the catch handler are not visible here. Presumably the object is
 * written back to ./tmp_data/licenses.json and errorCount / erroredLicenses
 * are updated on failure - TODO confirm.
 */
async function loadLicenses() {
console.log('Getting license list...');
const licenseList = await getLicenseList();
console.log('Downloading licenses...');
const pBar = new cliProgress.SingleBar({
'Downloading Licenses |' +
colors.cyan('{bar}') +
'| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount} Current: {licenseId}',
barCompleteChar: '\u2588',
barIncompleteChar: '\u2591',
hideCursor: true,
let errorCount = 0;
// Total is the full list length even though the loop below stops early.
pBar.start(licenseList.length, 0, {
licenseId: licenseList[0],
errorCount: errorCount,
// Start from previously downloaded licenses when a cache file is present.
const licenses = fs.existsSync('./tmp_data/licenses.json')
? JSON.parse(fs.readFileSync('./tmp_data/licenses.json', 'utf8'))
: {};
const erroredLicenses: string[] = [];
// The constructor argument is presumably the concurrency limit - TODO confirm.
const licenseQueue = new ThunkQueue(50);
for (let i = 0; i < licenseList.length; i++) {
const licenseId = licenseList[i];
// NOTE(review): i > 50 lets indices 0..50 through, i.e. 51 licenses are queued.
if(i > 50) break; // This is just to make sure we don't keep ringing up super costly GPT-3 charges
licenseQueue.add(async () => {
try {
licenses[licenseId] = await getLicense(licenseId);
JSON.stringify(licenses, null, 2)
} catch (err) {
pBar.increment(1, {
licenseId: licenseId,
errorCount: errorCount,
// Wait until every queued download has settled.
await licenseQueue.waitForAll();
/**
 * Runs processLicenseWithGPT over every downloaded license that has not been
 * processed yet and collects the results keyed by license id.
 *
 * Input is read from ./tmp_data/licenses.json and earlier results from
 * ./tmp_data/processedLicenses.json; either falls back to an empty object.
 * A license is stored in the processed list only when processLicenseWithGPT
 * returns a parsed object (not null and not a raw string).
 *
 * NOTE(review): the existsSync arguments, the handling of the failure branch,
 * the counter updates and the statement consuming the JSON.stringify output
 * are not visible here. Presumably results are written back to
 * ./tmp_data/processedLicenses.json - TODO confirm.
 */
async function processLicenses() {
const licenseList: { [key: string]: License } = fs.existsSync(
? JSON.parse(fs.readFileSync('./tmp_data/licenses.json', 'utf8'))
: {};
const processedLicenseList: { [key: string]: License } = fs.existsSync(
? JSON.parse(fs.readFileSync('./tmp_data/processedLicenses.json', 'utf8'))
: {};
const pBar = new cliProgress.SingleBar({
'Processing Licenses |' +
colors.cyan('{bar}') +
'| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount}, Succeeded: {successCount} Current: {licenseId}',
barCompleteChar: '\u2588',
barIncompleteChar: '\u2591',
hideCursor: true,
let errorCount = 0,
successCount = 0;
pBar.start(Object.keys(licenseList).length, 0, {
licenseId: 'None',
errorCount: errorCount,
successCount: successCount,
// Queue constructed with 1; presumably this processes one license at a time - TODO confirm.
const licenseQueue = new ThunkQueue(1);
for (const licenseId of Object.keys(licenseList)) {
const license = licenseList[licenseId];
licenseQueue.add(async () => {
// Skip licenses that already have an entry from an earlier run.
if (!processedLicenseList[licenseId]) {
const processedAnswers = await processLicenseWithGPT(license);
// null means no complete reply; a string means the reply could not be parsed as JSON.
if (processedAnswers === null || typeof processedAnswers === 'string') {
} else {
license.processedAnswers = processedAnswers;
processedLicenseList[licenseId] = license;
JSON.stringify(processedLicenseList, null, 2)
pBar.increment(1, {
licenseId: licenseId,
errorCount: errorCount,
successCount: successCount,
// Wait until every queued license has been handled.
await licenseQueue.waitForAll();
// Script entry point. Only the download step runs; the GPT processing step is
// left commented out and has to be enabled by hand.
(async function loadAndProcessLicenses() {
await loadLicenses();
// await processLicenses();
