
@navidshad
Last active October 17, 2023 08:41
Read all data from Firestore by a cursor and resolve the Bandwidth Exhausted error
/*
How can we read all data from a Firestore collection?

Problem:
When you read a huge collection document by document from Firestore, you run into two problems:
1. The default Firestore cursor is not powerful enough to get this job done on its own.
2. After a few thousand requests you get a "Bandwidth Exhausted" error.

Solution:
You need two things to resolve the problems above:
1. A more capable cursor
2. A child-process manager

Why a cursor?
The default Firestore cursor cannot read all data automatically; it needs you to tell it
to fetch the next page, then the next page, and so on. So we need a wrapper cursor that
advances the Firestore cursor through the pages automatically.

Why a child process?
After reading a few thousand documents the "Bandwidth Exhausted" error happens.
I found that if we stop the running process and start a new one, we can get past the error
and continue the job. So we need a child-process manager to do that for us.

To implement these two pieces we need two scripts: a parent (index.js) and a child (cursor.js).
The parent is the child-process manager; the child is the cursor. They exchange messages and
work together (the message protocol is summarized right after this comment block).

This is the scenario:
- The parent gets the collection details from the developer.
- It then runs the cursor as a child process and provides some basic info about the collection.
- The child walks through the collection and reads documents until it hits the
  [8 RESOURCE_EXHAUSTED] error, then sends a message to inform the parent about the error.
- The parent kills the child, creates a new one, and tells it where to resume reading.
*/
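//
// The messages the child script sends to the parent over the child_process IPC channel
// (a summary of what the two scripts below actually send):
//
//   { type: 'DOC',   doc: <document data> }            // one message per document read
//   { type: 'DONE' }                                    // the whole collection has been read
//   { type: 'ERROR', lastId: <last doc id>, error }     // Bandwidth Exhausted hit; resume after lastId
//
// The parent never messages the child back; it restarts the child and passes the resume point
// through environment variables (lastId, collection, limit, orderBy).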
//
// Parent Script =============================
// ===========================================
// you can call it index.js
//
const childProcess = require('child_process');
const path = require('path');
async function readAllDocs({
  // collection name
  collection,
  // provide a key to sort all docs by (used for a stable cursor)
  orderBy,
  // maximum number of documents per request
  limit = 1000,
  // a callback invoked once per document
  onDoc,
}) {
  return new Promise(async (done) => {
    let allowContinue = true;
    let lastId = null;
    let counter = 0;
    //
    // This scoped function is called repeatedly:
    // it runs the child script and listens to its messages.
    // Every time the Bandwidth Exhausted error happens, the child is killed and a new one is started,
    // until there are no documents left to read.
    const runChild = () => {
      console.log('run cursor for', collection);
      return new Promise((resolve) => {
        // Run a new child process
        const child = childProcess
          .fork(
            // path to the child script
            path.join(__dirname, 'cursor.js'),
            // process options
            {
              // pass a set of environment variables to the child process;
              // lastId is only included once there is an actual resume point,
              // otherwise the child would receive the string "null"
              env: {
                ...process.env,
                collection, limit, orderBy,
                ...(lastId ? { lastId } : {}),
              },
            },
          );
        // listen to the child's messages
        child.on('message', async (data) => {
          // When the message contains a document
          if (data.type == 'DOC') {
            if (onDoc) onDoc(data.doc, counter);
            counter++;
          }
          // When reading the collection has been done
          else if (data.type == 'DONE') {
            child.kill();
            // make this false to stop the while cycle
            allowContinue = false;
            resolve();
          }
          // When the "Bandwidth Exhausted" error happened
          // this child will be killed
          else if (data.type == 'ERROR') {
            child.kill();
            // make this true to continue the while cycle
            allowContinue = true;
            // store the last document id for running a new child
            lastId = data.lastId;
            resolve();
          }
        });
      });
    };
    // start a new child process while the condition is true
    while (allowContinue) {
      await runChild();
    }
    // stop reading
    done();
  });
}
// Now you can start reading a collection
readAllDocs({
  collection: 'users',
  orderBy: 'uid',
  onDoc: (doc, index) => {
    // do something with the current doc
  },
});
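//
// A minimal usage sketch (an assumed example, not part of the original gist): if the goal is
// to export the whole collection, the onDoc callback can append each document to a local
// NDJSON file. Swap it in for the call above; the file name is just an illustration.
//
// const fs = require('fs');
// const out = fs.createWriteStream('users-export.ndjson');
//
// readAllDocs({
//   collection: 'users',
//   orderBy: 'uid',
//   onDoc: (doc, index) => {
//     // one JSON document per line
//     out.write(JSON.stringify(doc) + '\n');
//     if (index % 1000 === 0) console.log('exported', index, 'docs');
//   },
// });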
//
// Child Script ==============================
// ===========================================
// you can call it cursor.js
//
const admin = require("firebase-admin");
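// NOTE: initializeApp({}) falls back to Google Application Default Credentials; in most
// setups this means GOOGLE_APPLICATION_CREDENTIALS points at a service-account key file,
// or the script runs in an environment that already provides credentials.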
admin.initializeApp({});
async function runCursor({
  // collection name
  collection,
  // maximum documents per call
  limit = 1000,
  // called once per document read
  onDoc,
  // called when the whole collection has been read
  onDone,
}) {
  let lastDoc;
  let lastId = process.env.lastId || null;
  let allowGoAhead = true;
  let orderBy = process.env.orderBy;
  if (lastId) {
    // Fetch the last document read by the previous (killed) process,
    // so the query below can resume right after it
    await admin.firestore().collection(collection).doc(lastId).get()
      .then(sp => {
        if (sp.exists) lastDoc = sp;
      });
  }
  // This inner function is called from the while loop below;
  // it reads one page of documents
  const getDocs = () => {
    let query = admin.firestore().collection(collection).orderBy(orderBy).limit(limit);
    if (lastDoc) {
      // a previous page exists, so start reading right after its last document
      query = query.startAfter(lastDoc);
    }
    return query.get().then(sp => {
      if (sp.docs.length > 0) {
        for (let i = 0; i < sp.docs.length; i++) {
          const doc = sp.docs[i];
          // run the onDoc callback
          if (onDoc) onDoc(doc);
        }
        // remember where this page ended
        lastDoc = sp.docs[sp.docs.length - 1];
        // continue the cursor
        allowGoAhead = true;
      } else {
        // stop the cursor if there are no more docs
        allowGoAhead = false;
      }
    }).catch(error => {
      console.log(error);
      // stop the loop and inform the parent process about this error,
      // passing the id of the last successfully read document
      allowGoAhead = false;
      process.send({ type: 'ERROR', lastId: lastDoc ? lastDoc.id : lastId, error: error.message });
    });
  };
  // keep reading pages until there are no more documents (or an error stops the loop)
  while (allowGoAhead) {
    await getDocs();
  }
  onDone();
}
runCursor({
  collection: process.env.collection,
  limit: parseInt(process.env.limit),
  onDoc: (doc) => {
    process.send({ type: 'DOC', doc: doc.data() });
  },
  onDone: () => {
    process.send({ type: 'DONE' });
  },
});
/*
That's it!
Use these two scripts to read all documents from a Firestore collection.
*/
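//
// To run (assuming the two scripts are saved as index.js and cursor.js in the same folder,
// with firebase-admin installed and Admin SDK credentials available, e.g. through the
// GOOGLE_APPLICATION_CREDENTIALS environment variable):
//
//   npm install firebase-admin
//   node index.js
//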