Skip to content

Instantly share code, notes, and snippets.

@eduwass
Last active August 10, 2023 04:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eduwass/1088102195dbc25c9d3b167c8c968d00 to your computer and use it in GitHub Desktop.
Save eduwass/1088102195dbc25c9d3b167c8c968d00 to your computer and use it in GitHub Desktop.
PDF to text using PDF.js
<!DOCTYPE html>
<html>
<head></head>
<body>
<input type="file" id="pdf-upload" accept=".pdf" />
<h1>Text will go here</h1>
<div class="pdf-text" id="pdf-text"></div>
<script src="https://mozilla.github.io/pdf.js/build/pdf.js"></script>
<script>
// When the user picks a file in the #pdf-upload input, read it into memory
// and hand the raw bytes to extractText().
document.getElementById("pdf-upload").addEventListener("change", (event) => {
  const file = event.target.files[0];
  if (!file) {
    // The selection was cleared; nothing to read.
    return;
  }

  const reader = new FileReader();

  reader.onload = async () => {
    try {
      // readAsArrayBuffer() yields an ArrayBuffer; wrap it as bytes for extractText().
      await extractText(new Uint8Array(reader.result));
    } catch (err) {
      // Without this, a corrupt or non-PDF file would surface only as an
      // unhandled promise rejection.
      console.error("Failed to extract text from the PDF:", err);
    }
  };

  reader.onerror = () => {
    console.error("Failed to read the selected file:", reader.error);
  };

  reader.readAsArrayBuffer(file);
});
// Tell pdf.js where to load its worker script from.
// NOTE(review): presumably this build must match the version of the pdf.js
// library loaded by the <script> tag — confirm both URLs stay in sync.
pdfjsLib.GlobalWorkerOptions.workerSrc =
"https://mozilla.github.io/pdf.js/build/pdf.worker.js";
/**
 * Appends `text` to the #pdf-text element one word at a time, pausing after
 * each word.
 *
 * Words are inserted as plain text nodes, never parsed as HTML, so markup-like
 * content extracted from a PDF cannot inject elements or scripts into the page.
 *
 * @param {string} text - Text to append; split on runs of whitespace.
 * @param {number} [delayMs=100] - Pause after each appended word, in milliseconds.
 * @returns {Promise<void>} Resolves once every word has been appended.
 */
async function appendTextWordByWord(text, delayMs = 100) {
  const output = document.getElementById("pdf-text");
  // Leading/trailing whitespace (or an empty page) makes split() yield empty
  // tokens; drop them so they do not cost a delay or emit stray spaces.
  const words = text.split(/\s+/).filter((word) => word !== "");
  for (const word of words) {
    // append() with a string creates a Text node, unlike `innerHTML +=`, which
    // re-parses the whole container as markup on every word.
    output.append(`${word} `);
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
}
/**
 * Loads a PDF from raw bytes and streams its text, page by page, into the page
 * via appendTextWordByWord().
 *
 * @param {Uint8Array} pdfData - Raw bytes of the PDF file.
 * @returns {Promise<void>} Resolves after the last page has been appended;
 *   rejects if the document cannot be loaded or a page cannot be read.
 */
async function extractText(pdfData) {
  const loadingTask = pdfjsLib.getDocument({ data: pdfData });
  try {
    const pdf = await loadingTask.promise;
    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        // Items flagged with hasEOL end a line; add a line break so the last
        // word of one line is not fused with the first word of the next.
        .map((item) => (item.hasEOL ? `${item.str}\n` : item.str))
        .join("");
      // Pages are appended strictly in order, one after another.
      await appendTextWordByWord(pageText);
    }
  } finally {
    // Release the document and its worker resources even when loading or
    // extraction fails.
    await loadingTask.destroy();
  }
}
// Debug marker: logged once the inline script has been evaluated to this point.
console.log("start");
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment