Skip to content

Instantly share code, notes, and snippets.

Last active October 28, 2022 16:23
Show Gist options
  • Save binarymax/b08d07a6a57d47c920376e86a92c7596 to your computer and use it in GitHub Desktop.
Save binarymax/b08d07a6a57d47c920376e86a92c7596 to your computer and use it in GitHub Desktop.
# Launch the Icecat vector loader against a Solr host.
# The first argument is the Solr hostname; when it is missing or empty the
# loader talks to localhost.
export NEURAL_HOST="${1:-localhost}"
node load_icecat.js --name icecat --files vectors/icecat.jsonl/ --host "$NEURAL_HOST"
// Formats and loads vector files into Solr
// Copyright 2022, Max Irwin
// MIT License
import fs from "fs";
import fetch from "node-fetch";
import progress from "progress";
import { v4 as uuidv4} from "uuid";
import { program } from "commander";
import { createClient } from "solr-client";
// Command-line options:
//   -f, --files    path to the directory holding the vector files
//   -s, --sitemap  sitemap.xml url (alternative to --files)
//   -n, --name     site name
//   -h, --host     Solr hostname
program
  .option('-f, --files <string>')
  .option('-s, --sitemap <string>')
  .option('-n, --name <string>')
  .option('-h, --host <string>');

// opts() only reflects the command line once parse() has run.
program.parse();
const options = program.opts();
// Makes a string usable as a path segment by collapsing every run of ':'
// and '/' characters into a single underscore.
//   filename: the raw string (e.g. a sitemap url)
//   returns:  the string with each ':' / '/' run replaced by '_'
function clean_filename(filename) {
  return filename.replace(/[:\/]+/g,'_');
}
// Resolve where the vector files live: an explicit --files path wins,
// otherwise the directory is derived from the --sitemap url.
let vector_files = null;
if (options.files) {
  vector_files = options.files;
} else if (options.sitemap) {
  vector_files = `vectors/${clean_filename(options.sitemap)}/`;
} else {
  console.error("You must specify the path to the vector files OR a sitemap.xml url!");
  // Nothing can be loaded without a source directory, so stop here.
  process.exit(1);
}

// The site name doubles as the Solr collection name, so it is mandatory.
if (!options.name) {
  console.error("You must specify the site name!");
  process.exit(1);
}

const host = options.host||"localhost";
const site = options.name; //"outdoors";

const ignore_fields = ["vectors","texts","entailed","paragraphs","context","body"];

// Number of documents sent to Solr per add request.
const batch_size = 10;

// Solr endpoints for collection administration and for this site's schema.
const collections_api = `http://${host}:8983/solr/admin/collections`;
const schema_api = `http://${host}:8983/api/collections/${site}/schema`;
// Performs an HTTP request and decodes the response body as JSON.
//   url:    target url
//   body:   for POST, the payload that is JSON-encoded into the request body;
//           otherwise an optional object appended to the url as a query string
//   method: "POST" sends a JSON POST; anything else (or nothing) is sent as GET
//   returns: an [error, output] pair. On success error is null and output is
//            the decoded JSON; when the fetch or the JSON decoding throws,
//            error is the caught exception and output is null.
async function request(url,body,method){
  const verb = method || "GET";

  // Built outside the try blocks so only fetch/decoding failures are
  // reported through the [error, output] pair.
  const target = (verb !== "POST" && body)
    ? url + "?" + new URLSearchParams(body).toString()
    : url;

  let response;
  try {
    if (verb === "POST") {
      response = await fetch(target, {
        method: 'POST',
        body: JSON.stringify(body),
        headers: {
          'Accept': 'application/json',
          'Content-Type': 'application/json'
        }
      });
    } else {
      response = await fetch(target);
    }
  } catch(ex) {
    return [ex,null];
  }

  try {
    const output = await response.json();
    return [null,output];
  } catch(ex) {
    return [ex,null];
  }
}
// Returns a promise that resolves (with no value) once `ms` milliseconds
// have elapsed; await it to pause an async function.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Waits until Solr answers the collections LIST request, then reports
// whether the collection named `site` is present.
//   site:    collection name to look for
//   returns: true when the LIST response's `collections` array contains
//            `site`, false otherwise
// A failed request (Solr not reachable yet, or an undecodable response) is
// retried once per second, silently and without an upper bound.
async function be_patient(site) {
  //Check if collection exists
  let exists_result = await request(collections_api,{"action":"LIST","name":site});
  while (exists_result[0]) {
    await sleep(1000);
    exists_result = await request(collections_api,{"action":"LIST","name":site});
  }

  // The collection names come back under the "collections" key; treat a
  // response without that array as "not found".
  const collections = exists_result[1] && exists_result[1].collections;
  const exists = Array.isArray(collections) && collections.includes(site);
  return exists;
}
//Make a new collection
// Ensures a Solr collection named after the module-level `site` is available
// and returns a solr-client handle bound to it.
//   DELETE:  when truthy and the collection already exists, it is deleted and
//            created again
//   returns: the client built by createClient for `host`, port 8983, core `site`
// NOTE(review): this block reached review with lines missing (closing braces
// and the bodies of the two schema object literals), so the exact nesting and
// the field definitions cannot be confirmed from here.
async function create_collection(DELETE) {
// be_patient keeps retrying until Solr answers, so this also waits for startup.
let exists = await be_patient(site);
//Delete it if forced
if (DELETE && exists) {
// The result of the DELETE request is not inspected; the log line below is
// printed whether or not the request succeeded.
let delete_result = await request(collections_api,{"action":"DELETE","name":site});
console.log(`Deleted collection ${site}`);
exists = false;
//Create the collection and vector fieldtype and field
if (!exists) {
console.log(`Creating collection ${site}`);
let create_result = await request(collections_api,{"action":"CREATE","name":site,"numShards":1,"replicationFactor":1});
// Index 0 of a request() result is the error slot; a falsy value means the call went through.
if (!create_result[0]) {
// NOTE(review): the contents of both schema payloads are missing here —
// restore them from the original file before relying on this function.
let vector_field_type = {
let vector_field = {
// Both schema payloads are POSTed to the collection's schema endpoint.
let field_type_result = await request(schema_api,vector_field_type,'POST');
if (field_type_result[0]) {
console.error("Could not create field type!");
let field_result = await request(schema_api,vector_field,'POST');
if (field_result[0]) {
console.error("Could not create field!");
} else {
console.error("Could not create collection!");
// Poll until the collection shows up in Solr's collection list.
// NOTE(review): whether this wait sits inside the `if (!exists)` branch is
// not recoverable from the visible lines — confirm.
while(!(await be_patient(site))) await sleep(1000);
console.log(`Collection ${site} Created`);
let client = createClient({host:host, port:8983, core:site});
client.autoCommit = true;
return client;
// Lists the entries of the directory `path` whose name contains ".json" and
// returns one record per match.
//   path:    directory to scan; it is concatenated directly with each entry
//            name, so it needs to end with a path separator
//   returns: the `files` array
// NOTE(review): the statement that appends to `files` is incomplete in this
// view; only the "filename" property of the record is visible.
function get_files(path) {
let files = [];
let filenames = fs.readdirSync(path);
for(var j=0;j<filenames.length;j++) {
// `>0` (rather than `>=0`) also rejects a name that starts with ".json".
if(filenames[j].indexOf(".json")>0) {
"filename":path + filenames[j],
return files;
// Reads every listed vector file and turns its vector/text pairs into the
// entries of the returned `documents` array.
//   files:   records carrying a `filename` property; each file is read
//            synchronously and parsed as JSON
//   ignore:  not referenced in the visible lines — presumably field names to
//            leave out of the generated documents; TODO confirm
//   returns: the `documents` array
// NOTE(review): this block reached review with lines missing and two
// expressions garbled (the `vector` and `docid` assignments), and the code
// that fills `documents` is not visible. `payloads` and `id` are not used in
// the visible lines.
function get_documents(files,ignore) {
let payloads = [];
let documents = [];
let id = 0;
for (var i=0;i<files.length;i++) {
let doc = JSON.parse(fs.readFileSync(files[i].filename,"utf-8"));
// Files without a non-empty `vectors` array are skipped.
// NOTE(review): `doc && doc` repeats the same test; part of the condition
// appears to be missing.
if (doc && doc && doc.vectors && doc.vectors.length) {
//For each vector and text pair, create a Solr point that we will eventually send to the search engine
for(var j=0;j<doc.vectors.length;j++) {
// vectors[j] and texts[j] are read as a pair, by index.
let vec = doc.vectors[j];
let txt = doc.texts[j];
if (vec.length) {
//Each document body might have been split up if it was long.
//We'll create a separate point for each part of the vectorized content.
let precision = 1000000;
// NOTE(review): garbled line — it looks like each component of `vec` is
// floored to 1/precision (six decimal places); confirm against the original.
let vector =>Math.floor(f32*precision)/precision);
let text = txt;
// NOTE(review): the left operand is missing; the id ends with "_" + j.
let docid = + "_" + j;
return documents;
//Create the collection!
const client = await create_collection(false);

//Get and transform the files into Solr docs.
const files = get_files(vector_files);
const documents = get_documents(files,ignore_fields);

//Add documents to Solr in batches of `batch_size`
// One tick per batch, so the total is the exact number of batches.
const total_batches = Math.ceil(documents.length/batch_size);
const bar = new progress("Indexing [:bar] :percent remaining::etas elapsed::elapsed (:current/:total)", {complete: "=", incomplete: " ", width: 50, total: total_batches});
for(let p=0;p<documents.length;p+=batch_size) {
  // Add the batch to Solr
  const batch = documents.slice(p,p+batch_size);
  await client.add(batch);
  bar.tick();
}

// Issue an explicit commit once every batch has been sent.
const commit = await request(`http://${host}:8983/solr/${site}/update?commit=true`);
if (commit[0]) {
  console.error("Could not commit the indexed documents!");
}
"name": "neural-solr-tools",
"version": "1.0.0",
"description": "Instant neural search for Solr 9, using Mighty Inference!",
"main": "load.js",
"type": "module",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
"author": "Max Irwin",
"license": "MIT",
"dependencies": {
"commander": "^9.1.0",
"express": "^4.17.3",
"html-entities": "^2.3.3",
"node-mighty": "^1.1.0",
"progress": "^2.0.3",
"solr-client": "^0.10.0-rc10",
"string-strip-html": "^9.1.7",
"uuid": "^8.3.2",
"xml2js": "^0.4.23"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment