Skip to content

Instantly share code, notes, and snippets.

@M1ndBlast
Last active December 6, 2021 23:41
Show Gist options
  • Save M1ndBlast/8dc4ecf3fa684d1111350ca0139812fc to your computer and use it in GitHub Desktop.
Save M1ndBlast/8dc4ecf3fa684d1111350ca0139812fc to your computer and use it in GitHub Desktop.
PDF Decoder to Text
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PDF Decoder to Text</title>
    <!-- PDF.js library, pinned to version 2.10.377 on the jsDelivr CDN. -->
    <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.10.377/build/pdf.min.js"></script>
    <!-- Page script; loaded after PDF.js. NOTE(review): presumably the script that adds the file picker to the body - confirm. -->
    <script src="pdf2txt.multiple-1.0.js"></script>
  </head>
  <!-- The body carries no static markup. -->
  <body></body>
</html>
// PDF.js is loaded through a <script> tag; keep a short alias to the
// exports it publishes on `window` under this key.
let PDFJS = window['pdfjs-dist/build/pdf'];

// The workerSrc property shall be specified: point it at the worker build
// of the same PDF.js version.
PDFJS.GlobalWorkerOptions.workerSrc =
  'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.10.377/build/pdf.worker.min.js';
/**
 * Extracts the plain text of a PDF document through PDF.js.
 *
 * Relies on a `PDFJS` binding (the PDF.js library) being in scope when
 * `pdfToText` is called. The public `complete` property holds the number of
 * pages finished by the most recent `pdfToText` call.
 */
function Pdf2TextClass() {
  const self = this;
  this.complete = 0;

  // Matches a fragment that is, or ends with, a stand-alone single letter.
  const SINGLE_LETTER_TAIL = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

  /**
   * Returns the origin of a text item as `{ x, y }`.
   * PDF.js text items carry their position in `transform[4]` / `transform[5]`;
   * explicit numeric `x` / `y` properties are honoured when an item has them.
   * Missing coordinates are reported as `undefined`.
   */
  function positionOf(item) {
    const matrix = Array.isArray(item.transform) ? item.transform : [];
    return {
      x: typeof item.x === 'number' ? item.x : matrix[4],
      y: typeof item.y === 'number' ? item.y : matrix[5],
    };
  }

  /**
   * Concatenates the text items of one page. Between two items, when the
   * previous one does not already end with a space:
   *  - a line break is inserted if the new item starts further left;
   *  - otherwise a space is inserted if the vertical position changed and the
   *    previous fragment does not end with a stand-alone single letter.
   */
  function joinTextItems(items) {
    let pageText = '';
    let previous = null;
    for (const item of items) {
      if (previous !== null && !previous.str.endsWith(' ')) {
        const here = positionOf(item);
        const before = positionOf(previous);
        if (here.x < before.x) {
          pageText += '\r\n';
        } else if (here.y !== before.y && !SINGLE_LETTER_TAIL.test(previous.str)) {
          pageText += ' ';
        }
      }
      pageText += item.str;
      previous = item;
    }
    return pageText;
  }

  /**
   * @param data ArrayBuffer of the pdf file content (or a string accepted by
   * PDFJS.getDocument)
   * @param callbackPageDone To inform the progress each time
   * when a page is finished. The callback function's input parameters are:
   * 1) number of pages done;
   * 2) total number of pages in file.
   * It is also called once with (0, total) before any page is processed.
   * @param callbackAllDone The input parameter of callback function is
   * the result of extracted text from pdf file, pages in page-number order,
   * each followed by a blank line.
   * @param callbackError Optional. Called with the error when loading or
   * reading the document fails; when omitted the error is logged with
   * console.error.
   */
  this.pdfToText = function (data, callbackPageDone, callbackAllDone, callbackError) {
    console.assert(data instanceof ArrayBuffer || typeof data == 'string', 'No data type allowed');

    const reportError = typeof callbackError === 'function'
      ? callbackError
      : function (error) { console.error(error); };

    // Per-call counter, so that reusing the instance starts again from zero.
    let finished = 0;
    self.complete = 0;

    PDFJS.getDocument(data).promise
      .then(function (pdf) {
        const total = pdf.numPages;
        callbackPageDone(0, total);

        // Pages are requested together; Promise.all keeps the results in
        // page-number order regardless of which page finishes first.
        const pending = [];
        for (let pageNumber = 1; pageNumber <= total; pageNumber++) {
          pending.push(
            pdf.getPage(pageNumber)
              .then(function (page) {
                return page.getTextContent();
              })
              .then(function (textContent) {
                const items = textContent != null && textContent.items != null
                  ? textContent.items
                  : [];
                const pageText = joinTextItems(items) + "\n\n";
                console.log("page " + pageNumber + " finished.");
                finished += 1;
                self.complete = finished;
                callbackPageDone(finished, total);
                return pageText;
              })
          );
        }
        return Promise.all(pending);
      })
      .then(function (pageTexts) {
        callbackAllDone(pageTexts.join(''));
      })
      .catch(reportError);
  }; // end of pdfToText()
} // end of class
// File picker that accepts several PDF files at once. Every selected file is
// read into memory, converted to text, and offered as a ".txt" download.
const input = document.createElement('input');
input.type = 'file';
input.multiple = true;
input.accept = 'application/pdf';

input.addEventListener('input', () => {
  for (const file of input.files) {
    const reader = new FileReader();

    reader.onload = () => {
      const arrayBuffer = reader.result;
      new Pdf2TextClass().pdfToText(
        arrayBuffer,
        (pagNum, tPagNum) => { console.log(`${pagNum}/${tPagNum}`); },
        (text) => {
          const downloadLink = document.createElement('a');
          // The text must be percent-encoded: raw '#', '%' or line breaks
          // would otherwise truncate or corrupt the data URI.
          downloadLink.href = `data:text/plain;charset=utf-8,${encodeURIComponent(text)}`;
          // Swap the last extension (if any) for ".txt"; a name without a dot
          // is kept whole instead of being reduced to ".txt".
          downloadLink.download = `${file.name.replace(/\.[^.]*$/, '')}.txt`;
          // The link is attached only for the duration of the synthetic click.
          document.body.appendChild(downloadLink);
          downloadLink.click();
          downloadLink.remove();
        }
      );
    };

    // Report unreadable files instead of failing silently.
    reader.onerror = () => {
      console.error(`Could not read "${file.name}"`, reader.error);
    };

    reader.readAsArrayBuffer(file);
  }
});

// The picker is attached once the document body is available.
document.addEventListener('DOMContentLoaded', () => {
  document.body.append(input);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment