M1ndBlast/pdf2txt.html

## pdf2txt.html
<!DOCTYPE html>
<html lang="en">
	<head>
		<meta charset="UTF-8" />
		<meta http-equiv="X-UA-Compatible" content="IE=edge" />
		<meta name="viewport" content="width=device-width, initial-scale=1.0" />
		<title>PDF Decoder to Text</title>
		<!-- PDF.js library, pinned to version 2.10.377 and loaded from the jsDelivr CDN. -->
		<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.10.377/build/pdf.min.js"></script>
		<!-- Page script: builds the PDF file picker and appends it to the body on DOMContentLoaded. -->
		<script src="pdf2txt.multiple-1.0.js"></script>
	</head>
	<!-- Empty in the markup; the page script above adds the file input here at runtime. -->
	<body></body>
</html>

## pdf2txt.multiple-1.0.js
// PDF.js is loaded through a <script> tag and exposes its exports on `window`
// under this key; keep a shortcut to them. The binding is never reassigned.
const PDFJS = window['pdfjs-dist/build/pdf'];
if (!PDFJS) {
	// Fail with a clear message instead of a TypeError on the next statement.
	throw new Error('PDF.js is not loaded: window["pdfjs-dist/build/pdf"] is undefined.');
}
// The workerSrc property shall be specified. It points at the worker build of
// the same pinned release (2.10.377) as the library URL used by the page.
PDFJS.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.10.377/build/pdf.worker.min.js';
/**
 * Extracts the plain text of a PDF document with PDF.js.
 *
 * Expects the PDF.js library object to be reachable through a `PDFJS`
 * binding. The public `complete` property holds the number of pages
 * processed so far by the most recent pdfToText() call.
 */
function Pdf2TextClass() {
	const self = this;
	this.complete = 0;

	// Text that is a lone letter (optionally preceded by one whitespace
	// character) or that ends in whitespace followed by a single letter.
	const singleLetterTail = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

	/**
	 * Returns the position of a text item along `axis` ('x' or 'y').
	 * A numeric `x`/`y` property on the item is used when present; otherwise
	 * the value comes from the item's `transform` array, whose elements 4 and 5
	 * hold the horizontal and vertical translation. Returns undefined when
	 * neither is available.
	 */
	function position(item, axis) {
		if (typeof item[axis] === 'number') {
			return item[axis];
		}
		const transform = item.transform;
		if (transform != null && transform.length >= 6) {
			return transform[axis === 'x' ? 4 : 5];
		}
		return undefined;
	}

	/**
	 * Joins the text items of one page into a single string.
	 * Between two items, when the previous one does not already end in a space:
	 *  - a line break is inserted if the next item starts further left;
	 *  - otherwise a space is inserted if the vertical position changed and the
	 *    previous text does not match `singleLetterTail`.
	 */
	function joinItems(items) {
		let pageText = '';
		let previous = null;
		for (const item of items) {
			if (previous !== null && previous.str[previous.str.length - 1] !== ' ') {
				if (position(item, 'x') < position(previous, 'x')) {
					pageText += '\r\n';
				} else if (position(previous, 'y') !== position(item, 'y') && !singleLetterTail.test(previous.str)) {
					pageText += ' ';
				}
			}
			pageText += item.str;
			previous = item;
		}
		return pageText;
	}

	/**
	 * @param data ArrayBuffer of the pdf file content
	 * @param callbackPageDone To inform the progress each time
	 *        when a page is finished. The callback function's input parameters are:
	 *        1) number of pages done;
	 *        2) total number of pages in file.
	 *        It is also called once with 0 pages done before any page is read.
	 * @param callbackAllDone The input parameter of callback function is
	 *        the result of extracted text from pdf file (pages in page order,
	 *        each followed by a blank line). It is invoked about one second
	 *        after the last page finishes, and is not invoked if the document
	 *        or one of its pages fails to load (the failure is logged instead).
	 */
	this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
		console.assert(data instanceof ArrayBuffer || typeof data == 'string', 'No data type allowed');
		// Start counting from zero so an instance can be used for more than one document.
		self.complete = 0;
		PDFJS.getDocument(data).promise.then(function (pdf) {
			const total = pdf.numPages;
			// Extracted text keyed by page number; pages may finish in any order.
			const layers = {};
			callbackPageDone(0, total);

			// `let` gives each iteration its own page number (no implicit global).
			for (let pageNumber = 1; pageNumber <= total; pageNumber++) {
				pdf.getPage(pageNumber).then(function (page) {
					const n = page.pageNumber;
					return page.getTextContent().then(function (textContent) {
						if (textContent.items != null) {
							layers[n] = joinItems(textContent.items) + "\n\n";
							console.log("page " + n + " finished.");
						}
						++self.complete;
						callbackPageDone(self.complete, total);
						if (self.complete === total) {
							// Delay kept from the original behaviour; every page is already collected here.
							window.setTimeout(function () {
								let fullText = "";
								// Walk every page number so a page without text cannot
								// shift or drop the pages that follow it.
								for (let j = 1; j <= total; j++) {
									if (layers[j] !== undefined) {
										fullText += layers[j];
									}
								}
								callbackAllDone(fullText);
							}, 1000);
						}
					}); // end of page.getTextContent().then
				}).catch(function (error) {
					console.error("Error while processing page " + pageNumber, error);
				}); // end of page.then
			} // of for
		}).catch(function (error) {
			console.error("Error while processing PDF document", error);
		});
	}; // end of pdfToText()
}; // end of class


// File picker for one or more PDFs. Each selected file is read into an
// ArrayBuffer, converted to text, and offered as a "<name>.txt" download.
const input = document.createElement('input');
input.type = 'file';
input.multiple = true;
input.accept = 'application/pdf';
input.addEventListener('input', function () {
	for (const file of this.files) {
		const reader = new FileReader();
		reader.onerror = function () {
			// Report read failures instead of silently producing no download.
			console.error('Could not read ' + file.name, reader.error);
		};
		reader.onload = function () {
			const arrayBuffer = reader.result;

			new Pdf2TextClass().pdfToText(
				arrayBuffer,
				(pagNum, tPagNum) => { console.log(`${pagNum}/${tPagNum}`); },
				(text) => {
					// Strip the extension only when there is one; a name without a
					// dot (or starting with its only dot) is kept whole.
					const dotIndex = file.name.lastIndexOf('.');
					const baseName = dotIndex > 0 ? file.name.slice(0, dotIndex) : file.name;

					const downloadLink = document.createElement('a');
					// The text must be percent-encoded: raw '#', '%' or line breaks
					// would otherwise truncate or corrupt the data URI payload.
					downloadLink.href = 'data:text/plain;charset=utf-8,' + encodeURIComponent(text);
					downloadLink.download = baseName + '.txt';

					// The link has to be in the document for click() to trigger the
					// download in every browser; it is removed right afterwards.
					document.body.appendChild(downloadLink);
					downloadLink.click();
					downloadLink.remove();
				}
			);
		};
		reader.readAsArrayBuffer(file);
	}
});

// The script is loaded from <head>, so wait for the body to exist before
// attaching the picker.
document.addEventListener('DOMContentLoaded', () => {
	document.body.append(input);
});
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8" />
	<meta http-equiv="X-UA-Compatible" content="IE=edge" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<title>PDF Decoder to Text</title>
	<!-- PDF.js library, pinned to version 2.10.377 and loaded from the jsDelivr CDN. -->
	<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.10.377/build/pdf.min.js"></script>
	<!-- Page script: builds the PDF file picker and appends it to the body on DOMContentLoaded. -->
	<script src="pdf2txt.multiple-1.0.js"></script>
	</head>
	<!-- Empty in the markup; the page script above adds the file input here at runtime. -->
	<body></body>
	</html>
	// PDF.js is loaded through a <script> tag and exposes its exports on `window`
	// under this key; keep a shortcut to them. The binding is never reassigned.
	const PDFJS = window['pdfjs-dist/build/pdf'];
	if (!PDFJS) {
		// Fail with a clear message instead of a TypeError on the next statement.
		throw new Error('PDF.js is not loaded: window["pdfjs-dist/build/pdf"] is undefined.');
	}
	// The workerSrc property shall be specified. It points at the worker build of
	// the same pinned release (2.10.377) as the library URL used by the page.
	PDFJS.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.10.377/build/pdf.worker.min.js';
	/**
	 * Extracts the plain text of a PDF document with PDF.js.
	 *
	 * Expects the PDF.js library object to be reachable through a `PDFJS`
	 * binding. The public `complete` property holds the number of pages
	 * processed so far by the most recent pdfToText() call.
	 */
	function Pdf2TextClass() {
		const self = this;
		this.complete = 0;

		// Text that is a lone letter (optionally preceded by one whitespace
		// character) or that ends in whitespace followed by a single letter.
		const singleLetterTail = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

		/**
		 * Returns the position of a text item along `axis` ('x' or 'y').
		 * A numeric `x`/`y` property on the item is used when present; otherwise
		 * the value comes from the item's `transform` array, whose elements 4 and 5
		 * hold the horizontal and vertical translation. Returns undefined when
		 * neither is available.
		 */
		function position(item, axis) {
			if (typeof item[axis] === 'number') {
				return item[axis];
			}
			const transform = item.transform;
			if (transform != null && transform.length >= 6) {
				return transform[axis === 'x' ? 4 : 5];
			}
			return undefined;
		}

		/**
		 * Joins the text items of one page into a single string.
		 * Between two items, when the previous one does not already end in a space:
		 *  - a line break is inserted if the next item starts further left;
		 *  - otherwise a space is inserted if the vertical position changed and the
		 *    previous text does not match `singleLetterTail`.
		 */
		function joinItems(items) {
			let pageText = '';
			let previous = null;
			for (const item of items) {
				if (previous !== null && previous.str[previous.str.length - 1] !== ' ') {
					if (position(item, 'x') < position(previous, 'x')) {
						pageText += '\r\n';
					} else if (position(previous, 'y') !== position(item, 'y') && !singleLetterTail.test(previous.str)) {
						pageText += ' ';
					}
				}
				pageText += item.str;
				previous = item;
			}
			return pageText;
		}

		/**
		 * @param data ArrayBuffer of the pdf file content
		 * @param callbackPageDone To inform the progress each time
		 *        when a page is finished. The callback function's input parameters are:
		 *        1) number of pages done;
		 *        2) total number of pages in file.
		 *        It is also called once with 0 pages done before any page is read.
		 * @param callbackAllDone The input parameter of callback function is
		 *        the result of extracted text from pdf file (pages in page order,
		 *        each followed by a blank line). It is invoked about one second
		 *        after the last page finishes, and is not invoked if the document
		 *        or one of its pages fails to load (the failure is logged instead).
		 */
		this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
			console.assert(data instanceof ArrayBuffer || typeof data == 'string', 'No data type allowed');
			// Start counting from zero so an instance can be used for more than one document.
			self.complete = 0;
			PDFJS.getDocument(data).promise.then(function (pdf) {
				const total = pdf.numPages;
				// Extracted text keyed by page number; pages may finish in any order.
				const layers = {};
				callbackPageDone(0, total);

				// `let` gives each iteration its own page number (no implicit global).
				for (let pageNumber = 1; pageNumber <= total; pageNumber++) {
					pdf.getPage(pageNumber).then(function (page) {
						const n = page.pageNumber;
						return page.getTextContent().then(function (textContent) {
							if (textContent.items != null) {
								layers[n] = joinItems(textContent.items) + "\n\n";
								console.log("page " + n + " finished.");
							}
							++self.complete;
							callbackPageDone(self.complete, total);
							if (self.complete === total) {
								// Delay kept from the original behaviour; every page is already collected here.
								window.setTimeout(function () {
									let fullText = "";
									// Walk every page number so a page without text cannot
									// shift or drop the pages that follow it.
									for (let j = 1; j <= total; j++) {
										if (layers[j] !== undefined) {
											fullText += layers[j];
										}
									}
									callbackAllDone(fullText);
								}, 1000);
							}
						}); // end of page.getTextContent().then
					}).catch(function (error) {
						console.error("Error while processing page " + pageNumber, error);
					}); // end of page.then
				} // of for
			}).catch(function (error) {
				console.error("Error while processing PDF document", error);
			});
		}; // end of pdfToText()
	}; // end of class


	// File picker for one or more PDFs. Each selected file is read into an
	// ArrayBuffer, converted to text, and offered as a "<name>.txt" download.
	const input = document.createElement('input');
	input.type = 'file';
	input.multiple = true;
	input.accept = 'application/pdf';
	input.addEventListener('input', function () {
		for (const file of this.files) {
			const reader = new FileReader();
			reader.onerror = function () {
				// Report read failures instead of silently producing no download.
				console.error('Could not read ' + file.name, reader.error);
			};
			reader.onload = function () {
				const arrayBuffer = reader.result;

				new Pdf2TextClass().pdfToText(
					arrayBuffer,
					(pagNum, tPagNum) => { console.log(`${pagNum}/${tPagNum}`); },
					(text) => {
						// Strip the extension only when there is one; a name without a
						// dot (or starting with its only dot) is kept whole.
						const dotIndex = file.name.lastIndexOf('.');
						const baseName = dotIndex > 0 ? file.name.slice(0, dotIndex) : file.name;

						const downloadLink = document.createElement('a');
						// The text must be percent-encoded: raw '#', '%' or line breaks
						// would otherwise truncate or corrupt the data URI payload.
						downloadLink.href = 'data:text/plain;charset=utf-8,' + encodeURIComponent(text);
						downloadLink.download = baseName + '.txt';

						// The link has to be in the document for click() to trigger the
						// download in every browser; it is removed right afterwards.
						document.body.appendChild(downloadLink);
						downloadLink.click();
						downloadLink.remove();
					}
				);
			};
			reader.readAsArrayBuffer(file);
		}
	});

	// The script is loaded from <head>, so wait for the body to exist before
	// attaching the picker.
	document.addEventListener('DOMContentLoaded', () => {
		document.body.append(input);
	});