#!/bin/bash
# Usage: pass an optional Solr host as the first argument; defaults to localhost.
export NEURAL_HOST=$1
if [ -z "$1" ]
then
  export NEURAL_HOST=localhost
fi
node load_icecat.js --name icecat --files vectors/icecat.jsonl/ --host "$NEURAL_HOST"

//
// Formats and loads vector files into Solr
// Copyright 2022, Max Irwin
// MIT License
//
import fs from "fs";
import fetch from "node-fetch";
import progress from "progress";
import { v4 as uuidv4 } from "uuid";
import { program } from "commander";
import { createClient } from "solr-client";

program
  .option('-f, --files <string>')
  .option('-s, --sitemap <string>')
  .option('-n, --name <string>')
  .option('-h, --host <string>')
  .parse();

const options = program.opts();
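
// Example invocation (mirrors the wrapper script above; --host is optional
// and falls back to localhost):
//   node load_icecat.js --name icecat --files vectors/icecat.jsonl/ --host localhost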
function clean_filename(filename) {
  return filename.replace(/[:\/]+/g, '_');
}

let vector_files = null;
if (!options.files) {
  if (!options.sitemap) {
    console.error("You must specify the path to the vector files OR a sitemap.xml url!");
    program.help();
    process.exit(1);
  } else {
    vector_files = `vectors/${clean_filename(options.sitemap)}/`;
  }
} else {
  vector_files = options.files;
}

if (!options.name) {
  console.error("You must specify the site name!");
  program.help();
  process.exit(1);
}

//Globals
const host = options.host || "localhost";
const site = options.name; //"outdoors";
const ignore_fields = ["vectors","texts","entailed","paragraphs","context","body"];
const batch_size = 10;
const collections_api = `http://${host}:8983/solr/admin/collections`;
const schema_api = `http://${host}:8983/api/collections/${site}/schema`;

// Minimal fetch wrapper that never throws: it returns an [error, output]
// tuple so callers can inspect failures and retry.
async function request(url, body, method) {
  method = method || "GET";
  let response;
  if (method === "POST") {
    try {
      response = await fetch(url, {
        method: 'POST',
        body: JSON.stringify(body),
        headers: {
          'Accept': 'application/json',
          'Content-Type': 'application/json'
        }
      });
    } catch (ex) {
      return [ex, null];
    }
  } else {
    if (body) url += "?" + new URLSearchParams(body).toString();
    try {
      response = await fetch(url);
    } catch (ex) {
      return [ex, null];
    }
  }
  try {
    const output = await response.json();
    return [null, output];
  } catch (ex) {
    return [ex, null];
  }
}
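
// Example of the [error, output] tuple this wrapper returns (a hypothetical
// caller, mirroring how be_patient() below uses it):
//   const [err, res] = await request(collections_api, { "action": "LIST" });
//   if (err) { /* Solr unreachable - retry or abort */ }
//   else { console.log(res.collections); }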
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

// Poll the collections API until Solr responds, then report whether the
// collection already exists.
async function be_patient(site) {
  //Check if collection exists
  let exists_result = await request(collections_api, {"action":"LIST","name":site});
  while (exists_result[0]) {
    await sleep(1000);
    //console.error("Could not connect to Solr. The server might not be started yet. Retrying...");
    exists_result = await request(collections_api, {"action":"LIST","name":site});
  }
  const exists = !!(
    exists_result[1] &&
    exists_result[1].collections &&
    exists_result[1].collections.indexOf(site) >= 0
  );
  return exists;
}

//Make a new collection
async function create_collection(DELETE) {
  let exists = await be_patient(site);

  //Delete it if forced
  if (DELETE && exists) {
    await request(collections_api, {"action":"DELETE","name":site});
    console.log(`Deleted collection ${site}`);
    exists = false;
  }

  //Create the collection and vector fieldtype and field
  if (!exists) {
    console.log(`Creating collection ${site}`);
    const create_result = await request(collections_api, {"action":"CREATE","name":site,"numShards":1,"replicationFactor":1});
    if (!create_result[0]) {
      const vector_field_type = {
        "add-field-type": {
          "name": "hnsw",
          "class": "solr.DenseVectorField",
          "similarityFunction": "cosine",
          "vectorDimension": 768,
          "codecFormat": "Lucene90HnswVectorsFormat",
          "hnswMaxConnections": 16,
          "hnswBeamWidth": 512
        }
      };
      const vector_field = {
        "add-field": {
          "name": "vector",
          "type": "hnsw",
          "stored": true,
          "indexed": true
        }
      };
      const field_type_result = await request(schema_api, vector_field_type, 'POST');
      if (field_type_result[0]) {
        console.error("Could not create field type!");
        console.error(field_type_result[0]);
        process.exit(1);
      }
      const field_result = await request(schema_api, vector_field, 'POST');
      if (field_result[0]) {
        console.error("Could not create field!");
        console.error(field_result[0]);
        process.exit(1);
      }
    } else {
      console.error("Could not create collection!");
      console.error(create_result[0]);
      process.exit(1);
    }
  }

  while (!(await be_patient(site))) await sleep(1000);
  console.log(`Collection ${site} Created`);

  const client = createClient({host: host, port: 8983, core: site});
  client.autoCommit = true;
  return client;
}
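
// For reference, the two schema requests above amount to POSTs like the
// following (a sketch, assuming the default localhost host and port):
//   curl -X POST -H 'Content-Type: application/json' \
//     "http://localhost:8983/api/collections/icecat/schema" \
//     -d '{"add-field-type":{"name":"hnsw","class":"solr.DenseVectorField",...}}'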
// Collect all .json files from the given directory
function get_files(path) {
  const files = [];
  const filenames = fs.readdirSync(path);
  for (let j = 0; j < filenames.length; j++) {
    if (filenames[j].indexOf(".json") > 0) {
      files.push({
        "filename": path + filenames[j],
      });
    }
  }
  return files;
}

// Transform each vector file into one Solr document per vector/text pair.
// (The `ignore` parameter lists fields to skip and is currently unused.)
function get_documents(files, ignore) {
  const documents = [];
  for (let i = 0; i < files.length; i++) {
    const doc = JSON.parse(fs.readFileSync(files[i].filename, "utf-8"));
    if (doc && doc.vectors && doc.vectors.length) {
      //For each vector and text pair, create a Solr point that we will eventually send to the search engine
      for (let j = 0; j < doc.vectors.length; j++) {
        const vec = doc.vectors[j];
        const txt = doc.texts[j];
        if (vec.length) {
          //Each document body might have been split up if it was long.
          //We'll create a separate point for each part of the vectorized content.
          //Truncate each float to six decimal places to keep the payload small.
          const precision = 1000000;
          const vector = vec.map(f32 => Math.floor(f32 * precision) / precision);
          const docid = doc.id + "_" + j;
          documents.push({
            "id": docid,
            "name_s": doc.name_s,
            "title_txt_en": doc.title_txt_en,
            "thumbUrl_s": doc.thumbUrl_s,
            "longDescription_txt_en": doc.longDescription_txt_en,
            "shortDescription_txt_en": doc.shortDescription_txt_en,
            "longSummary_txt_en": doc.longSummary_txt_en,
            "shortSummary_txt_en": doc.shortSummary_txt_en,
            "features_ss": doc.features_ss,
            "cat_s": doc.cat_s,
            "vector": vector,
            "text_txt_en": txt
          });
        }
      }
    }
  }
  return documents;
}

//Create the collection!
const client = await create_collection(false);

//Get and transform the files into Solr docs.
const files = get_files(vector_files);
const documents = get_documents(files, ignore_fields);

//Add documents to Solr in batches of 10
const bar = new progress("Indexing [:bar] :percent remaining::etas elapsed::elapsed (:current/:total)", {complete: "=", incomplete: " ", width: 50, total: Math.ceil(documents.length / batch_size)});
for (let p = 0; p < documents.length; p += batch_size) {
  // Add the batch to Solr
  const batch = documents.slice(p, p + batch_size);
  await client.add(batch);
  bar.tick();
}

//Issue a final hard commit so all documents are searchable
await request(`http://${host}:8983/solr/${site}/update?commit=true`);
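
// Once loaded, the vectors are searchable with Solr 9's KNN query parser.
// A sketch (the query vector must be 768 floats, matching the field type):
//   curl "http://localhost:8983/solr/icecat/select" \
//     --data-urlencode 'q={!knn f=vector topK=10}[0.012, -0.034, ...]'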
{
  "name": "neural-solr-tools",
  "version": "1.0.0",
  "description": "Instant neural search for Solr 9, using Mighty Inference!",
  "main": "load.js",
  "type": "module",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Max Irwin",
  "license": "MIT",
  "dependencies": {
    "commander": "^9.1.0",
    "express": "^4.17.3",
    "html-entities": "^2.3.3",
    "node-fetch": "^3.2.0",
    "node-mighty": "^1.1.0",
    "progress": "^2.0.3",
    "solr-client": "^0.10.0-rc10",
    "string-strip-html": "^9.1.7",
    "uuid": "^8.3.2",
    "xml2js": "^0.4.23"
  }
}