Skip to content

Instantly share code, notes, and snippets.

@charanhu
Created April 1, 2024 14:47
Show Gist options
  • Save charanhu/f2929c9bd1c25375e39ef98e543c1efc to your computer and use it in GitHub Desktop.
Save charanhu/f2929c9bd1c25375e39ef98e543c1efc to your computer and use it in GitHub Desktop.
<!DOCTYPE html>
<html>
<head>
<title>OCR PDFs and images directly in your browser</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script defer data-domain="tools.simonwillison.net" src="https://plausible.io/js/script.js"></script>
<script type="module">
// Load PDF.js (pinned to 4.0.379) as an ES module from the jsDelivr CDN.
// NOTE(review): the default-import binding `pdfjsDist` is never referenced; this
// script and convertPDFToImages() use a global named `pdfjsLib` instead —
// presumably the imported module defines that global as a side effect. TODO confirm.
import pdfjsDist from 'https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.379/+esm';
// Point PDF.js at its worker script, taken from the same package version as above.
pdfjsLib.GlobalWorkerOptions.workerSrc =
"https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.379/build/pdf.worker.min.mjs";
</script>
<script src="https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js"></script>
<style>
body {
padding: 1em;
font-family: helvetica, sans-serif;
line-height: 1.3;
}
.dropzone {
box-sizing: border-box;
width: 100%;
height: 10em;
border: 2px dashed #ccc;
display: flex;
justify-content: center;
align-items: center;
font-size: 24px;
cursor: pointer;
padding: 1em;
margin-bottom: 1em;
}
.dropzone.disabled {
cursor: not-allowed;
}
.dropzone.drag-over {
background-color: pink;
}
.image-container img {
margin-bottom: 10px;
max-width: 100%;
}
textarea {
width: 100%;
height: 10em;
margin-bottom: 20px;
box-sizing: border-box;
}
.full-document-section {
display: none;
margin-bottom: 20px;
}
</style>
</head>
<body>
<h1>OCR PDFs and images directly in your browser</h1>
<p>This tool runs entirely in your browser. No files are uploaded to a server.</p>
<p>It uses <a href="https://tesseract.projectnaptha.com/">Tesseract.js</a> for OCR and <a
href="https://mozilla.github.io/pdf.js/">PDF.js</a> to convert PDFs into images.</p>
<p><label>Language: <select id="id_language">
<option>ENG</option>
</select></label></p>
<input type="file" id="fileInput" accept=".pdf,.jpg,.jpeg,.png,.gif" style="display: none;" />
<div class="dropzone" id="dropzone">
Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
</div>
<div class="full-document-section" id="fullDocumentSection">
<h2>Full document</h2>
<textarea class="full-document" id="fullDocument"></textarea>
<h2>Pages</h2>
</div>
<div class="image-container"></div>
<p><a href="https://github.com/simonw/tools/blob/main/ocr.html">Source code</a>. <a
href="https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/">How I built this</a>.</p>
<script>
// Pixel width of the canvas each PDF page is rendered to before OCR
const desiredWidth = 1000;
// Page elements the script reads from and writes to
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.getElementById('fullDocument');
const fullDocumentSection = document.getElementById('fullDocumentSection');
const languageSelect = document.getElementById('id_language');
// Set to false by processFile() while a file is being processed; the dropzone
// drag/drop/click handlers do nothing while it is false
let fileSelectionAllowed = true;
// Display names for the language select box, keyed by language code.
// The select is populated from the values of Tesseract.languages, and each
// code is looked up in this table to get the option's label.
const LANGUAGES = {
"afr": "Afrikaans",
"amh": "Amharic",
"ara": "Arabic",
"asm": "Assamese",
"aze": "Azerbaijani",
"aze_cyrl": "Azerbaijani - Cyrillic",
"bel": "Belarusian",
"ben": "Bengali",
"bod": "Tibetan",
"bos": "Bosnian",
"bul": "Bulgarian",
"cat": "Catalan; Valencian",
"ceb": "Cebuano",
"ces": "Czech",
"chi_sim": "Chinese - Simplified",
"chi_tra": "Chinese - Traditional",
"chr": "Cherokee",
"cym": "Welsh",
"dan": "Danish",
"deu": "German",
"dzo": "Dzongkha",
"ell": "Greek, Modern (1453-)",
"eng": "English",
"enm": "English, Middle (1100-1500)",
"epo": "Esperanto",
"est": "Estonian",
"eus": "Basque",
"fas": "Persian",
"fin": "Finnish",
"fra": "French",
"frk": "German Fraktur",
"frm": "French, Middle (ca. 1400-1600)",
"gle": "Irish",
"glg": "Galician",
"grc": "Greek, Ancient (-1453)",
"guj": "Gujarati",
"hat": "Haitian; Haitian Creole",
"heb": "Hebrew",
"hin": "Hindi",
"hrv": "Croatian",
"hun": "Hungarian",
"iku": "Inuktitut",
"ind": "Indonesian",
"isl": "Icelandic",
"ita": "Italian",
"ita_old": "Italian - Old",
"jav": "Javanese",
"jpn": "Japanese",
"kan": "Kannada",
"kat": "Georgian",
"kat_old": "Georgian - Old",
"kaz": "Kazakh",
"khm": "Central Khmer",
"kir": "Kirghiz; Kyrgyz",
"kor": "Korean",
"kur": "Kurdish",
"lao": "Lao",
"lat": "Latin",
"lav": "Latvian",
"lit": "Lithuanian",
"mal": "Malayalam",
"mar": "Marathi",
"mkd": "Macedonian",
"mlt": "Maltese",
"msa": "Malay",
"mya": "Burmese",
"nep": "Nepali",
"nld": "Dutch; Flemish",
"nor": "Norwegian",
"ori": "Oriya",
"pan": "Panjabi; Punjabi",
"pol": "Polish",
"por": "Portuguese",
"pus": "Pushto; Pashto",
"ron": "Romanian; Moldavian; Moldovan",
"rus": "Russian",
"san": "Sanskrit",
"sin": "Sinhala; Sinhalese",
"slk": "Slovak",
"slv": "Slovenian",
"spa": "Spanish; Castilian",
"spa_old": "Spanish; Castilian - Old",
"sqi": "Albanian",
"srp": "Serbian",
"srp_latn": "Serbian - Latin",
"swa": "Swahili",
"swe": "Swedish",
"syr": "Syriac",
"tam": "Tamil",
"tel": "Telugu",
"tgk": "Tajik",
"tgl": "Tagalog",
"tha": "Thai",
"tir": "Tigrinya",
"tur": "Turkish",
"uig": "Uighur; Uyghur",
"ukr": "Ukrainian",
"urd": "Urdu",
"uzb": "Uzbek",
"uzb_cyrl": "Uzbek - Cyrillic",
"vie": "Vietnamese",
"yid": "Yiddish"
}
// Populate the languages select box.
// First drop whatever placeholder children the markup put inside the <select>.
while (languageSelect.firstChild) {
  languageSelect.removeChild(languageSelect.firstChild);
}
for (const code of Object.values(Tesseract.languages)) {
  const option = document.createElement('option');
  option.value = code;
  // Fall back to the raw code so that a language missing from LANGUAGES
  // never ends up as an option with a blank label
  option.textContent = LANGUAGES[code] || code;
  // English is the default selection
  if (code === 'eng') {
    option.selected = true;
  }
  languageSelect.appendChild(option);
}
/**
 * Refresh the combined "Full document" textarea from the per-image textareas.
 *
 * The section is shown, with the trimmed texts joined by a blank line, only
 * when more than one textarea holds non-blank text. Otherwise the combined
 * textarea is emptied and the section is hidden.
 */
function showFullDocument() {
  const pageTexts = [];
  for (const textarea of Array.from(document.querySelectorAll('.image-container textarea'))) {
    const trimmed = textarea.value.trim();
    if (trimmed.length) {
      pageTexts.push(trimmed);
    }
  }
  if (pageTexts.length <= 1) {
    fullDocumentTextarea.value = '';
    fullDocumentSection.style.display = 'none';
    return;
  }
  fullDocumentTextarea.value = pageTexts.join("\n\n");
  fullDocumentSection.style.display = 'block';
}
/**
 * Put text into a textarea and resize the textarea to fit it.
 *
 * @param {HTMLTextAreaElement} ta - textarea to fill
 * @param {string} text - text to show; surrounding whitespace is stripped
 */
function setTextarea(ta, text) {
  ta.value = text.trim();
  const { style } = ta;
  // Height is reset to 'auto' before scrollHeight is read — presumably so the
  // measurement reflects the new content rather than the previous height
  style.height = 'auto';
  style.height = `${ta.scrollHeight + 5}px`;
}
// Hook the dropzone's drag-and-drop and click events up to their handlers
for (const [eventName, handler] of [
  ['dragover', handleDragOver],
  ['dragleave', handleDragLeave],
  ['drop', handleDrop],
  ['click', handleClick],
]) {
  dropzone.addEventListener(eventName, handler);
}
/**
 * 'dragover' handler for the dropzone: suppresses the browser's default
 * handling and highlights the dropzone, unless file selection is disabled.
 *
 * @param {DragEvent} event
 */
async function handleDragOver(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.add('drag-over');
}
/**
 * 'dragleave' handler for the dropzone: suppresses the browser's default
 * handling and removes the highlight, unless file selection is disabled.
 *
 * @param {DragEvent} event
 */
async function handleDragLeave(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.remove('drag-over');
}
/**
 * 'drop' handler for the dropzone: hands the first dropped file to
 * processFile() and mirrors the dropped files onto the hidden file input.
 *
 * Does nothing beyond suppressing the default action while file selection is
 * disabled. A drop that carries no files (for example dragged text or a link)
 * only clears the highlight.
 *
 * @param {DragEvent} event
 * @returns {Promise<void>} settles when processing of the dropped file ends
 */
async function handleDrop(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.remove('drag-over');
  const droppedFiles = event.dataTransfer.files;
  const file = droppedFiles[0];
  if (!file) {
    // Nothing to OCR; passing undefined on would fail inside processFile()
    return;
  }
  fileInput.files = droppedFiles;
  await processFile(file);
}
/**
 * 'click' handler for the dropzone: forwards the click to the hidden file
 * input, unless file selection is disabled.
 */
async function handleClick() {
  if (!fileSelectionAllowed) {
    return;
  }
  fileInput.click();
}
// Process the file chosen through the hidden file input
fileInput.addEventListener('change', (event) => {
  const file = event.target.files[0];
  // The file list can be empty (e.g. the selection was cleared); there is
  // nothing to process then, and processFile() expects a real file
  if (!file) {
    return;
  }
  processFile(file);
});
/**
 * OCR a dropped, selected or pasted file and render the results on the page.
 *
 * A file whose type is 'application/pdf' is rendered page by page through
 * convertPDFToImages(); anything else is treated as a single image. Each
 * image is shown with a textarea that receives its recognized text, and the
 * combined "Full document" view is refreshed after every image.
 *
 * While this runs the dropzone is marked disabled and fileSelectionAllowed is
 * false. Both are restored and the Tesseract worker is terminated when the
 * run ends, whether it succeeded or threw.
 *
 * @param {File} file - file to process; a missing value is ignored
 * @returns {Promise<void>} rejects with any error raised while creating the
 *   worker, rendering the PDF or running OCR
 */
async function processFile(file) {
  // Nothing to do without a file. Also refuse to start while another run is
  // in progress: both runs would clear and fill the same page elements.
  if (!file || !fileSelectionAllowed) {
    return;
  }
  // Lock the UI before the first await so nothing can sneak in while the
  // worker is still being created
  const originalText = dropzone.innerText;
  fileSelectionAllowed = false;
  dropzone.classList.add('disabled');
  dropzone.innerText = 'Processing file...';
  fullDocumentTextarea.value = '';
  fullDocumentSection.style.display = 'none';
  imageContainer.innerHTML = '';
  let worker;
  try {
    worker = await Tesseract.createWorker(languageSelect.value);
    if (file.type === 'application/pdf') {
      const { numPages, imageIterator } = await convertPDFToImages(file);
      let done = 0;
      dropzone.innerText = `Processing ${numPages} page${numPages > 1 ? 's' : ''}`;
      for await (const { imageURL } of imageIterator) {
        const ta = await displayImage(imageURL);
        const { text } = await ocrImage(worker, imageURL);
        setTextarea(ta, text);
        showFullDocument();
        done += 1;
        dropzone.innerText = `Done ${done} of ${numPages}`;
      }
    } else {
      // The object URL is not revoked: the displayed <img> keeps using it,
      // and rerunOCR() reads it back from the image's src
      const imageURL = URL.createObjectURL(file);
      const ta = await displayImage(imageURL);
      const { text } = await ocrImage(worker, imageURL);
      setTextarea(ta, text);
      showFullDocument();
    }
  } finally {
    // Release the worker and unlock the UI even when something above threw;
    // the nested finally guarantees the unlock if terminate() itself fails
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      dropzone.innerText = originalText;
      dropzone.classList.remove('disabled');
      fileSelectionAllowed = true;
    }
  }
}
/**
 * Append an image, followed by a textarea for its OCR text, to the image
 * container.
 *
 * @param {string} imageURL - URL used as the image's src
 * @returns {Promise<HTMLTextAreaElement>} the newly added textarea
 */
async function displayImage(imageURL) {
  const preview = document.createElement('img');
  preview.src = imageURL;
  imageContainer.appendChild(preview);

  const ocrOutput = document.createElement('textarea');
  ocrOutput.classList.add('textarea-alt');
  // Shown until the recognized text is filled in
  ocrOutput.placeholder = 'OCRing image...';
  imageContainer.appendChild(ocrOutput);
  return ocrOutput;
}
/**
 * Run OCR again, with the currently selected language, over every image that
 * is already displayed, replacing the text in each image's textarea.
 *
 * Does nothing when no textareas are present. Progress is reported in the
 * dropzone. If no file is being processed when this starts, file selection is
 * disabled for the duration and the dropzone text is put back afterwards.
 * The Tesseract worker is terminated whether the run succeeded or threw.
 *
 * @returns {Promise<void>} rejects with any error raised by worker creation
 *   or OCR
 */
async function rerunOCR() {
  const textareas = document.querySelectorAll('.image-container textarea');
  const images = document.querySelectorAll('.image-container img');
  if (!textareas.length) {
    return;
  }
  // Blank all the textareas
  for (const ta of Array.from(textareas)) {
    ta.value = '';
  }
  showFullDocument();
  const numImages = images.length;
  // Take over the dropzone only when processFile() is not already using it;
  // otherwise its lock and label belong to that run and must be left alone
  const ownsDropzone = fileSelectionAllowed;
  const originalText = dropzone.innerText;
  if (ownsDropzone) {
    // Block new files: they would replace the images this loop is walking
    fileSelectionAllowed = false;
    dropzone.classList.add('disabled');
  }
  dropzone.innerText = `Processing ${numImages} image${numImages > 1 ? 's' : ''}`;
  let worker;
  try {
    worker = await Tesseract.createWorker(languageSelect.value);
    let done = 0;
    for (let i = 0; i < numImages; i++) {
      const imageURL = images[i].src;
      const ta = textareas[i];
      ta.placeholder = 'OCRing image...';
      const { text } = await ocrImage(worker, imageURL);
      setTextarea(ta, text);
      showFullDocument();
      done += 1;
      dropzone.innerText = `Done ${done} of ${numImages}`;
    }
  } finally {
    // Release the worker and unlock the UI even when something above threw
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      if (ownsDropzone) {
        dropzone.innerText = originalText;
        dropzone.classList.remove('disabled');
        fileSelectionAllowed = true;
      }
    }
  }
}
// OCR an image pasted anywhere on the page
document.addEventListener('paste', (event) => {
  // Ignore pastes while a file is being processed, like the dropzone does
  if (!fileSelectionAllowed) {
    return;
  }
  // `originalEvent` only exists on wrapped (jQuery-style) events; on a native
  // event it is undefined, so it must not be dereferenced unconditionally
  const clipboardData = event.clipboardData
    || (event.originalEvent && event.originalEvent.clipboardData);
  if (!clipboardData) {
    return;
  }
  const imageItem = Array.from(clipboardData.items).find(
    (item) => item.type.includes('image')
  );
  // getAsFile() returns null when the item is not backed by a file
  const file = imageItem ? imageItem.getAsFile() : null;
  if (file) {
    processFile(file);
  }
});
/**
 * Open a PDF with PDF.js and expose its pages as lazily rendered JPEG images.
 *
 * Each page is drawn onto a canvas that is desiredWidth pixels wide, with the
 * height scaled to keep the page's proportions, and exported as a JPEG data
 * URL at quality 0.8. A page that fails to load or render is logged with
 * console.error and skipped, so the iterator may yield fewer than numPages
 * items.
 *
 * @param {File} file - the PDF file
 * @returns {Promise<{numPages: number, imageIterator: AsyncGenerator<{imageURL: string}>}>}
 */
async function convertPDFToImages(file) {
  const loadingTask = pdfjsLib.getDocument(URL.createObjectURL(file));
  const pdfDocument = await loadingTask.promise;
  const { numPages } = pdfDocument;

  async function* renderPages() {
    let pageNumber = 0;
    while (pageNumber < numPages) {
      pageNumber += 1;
      try {
        const page = await pdfDocument.getPage(pageNumber);
        // The scale-1 viewport gives the page's own size, from which the
        // factor needed to reach desiredWidth is derived
        const baseViewport = page.getViewport({ scale: 1 });
        const scale = desiredWidth / baseViewport.width;
        const canvas = document.createElement('canvas');
        const context = canvas.getContext('2d');
        canvas.width = desiredWidth;
        canvas.height = scale * baseViewport.height;
        const renderTask = page.render({
          canvasContext: context,
          viewport: page.getViewport({ scale }),
        });
        await renderTask.promise;
        yield { imageURL: canvas.toDataURL('image/jpeg', 0.8) };
      } catch (error) {
        console.error(`Error rendering page ${pageNumber}:`, error);
      }
    }
  }

  return { numPages, imageIterator: renderPages() };
}
/**
 * Recognize the text in one image.
 *
 * @param {object} worker - Tesseract worker; its recognize() method is called
 * @param {string} imageUrl - URL of the image, passed straight to recognize()
 * @returns {Promise<{text: string}>} the `data.text` of the recognition result
 */
async function ocrImage(worker, imageUrl) {
  const result = await worker.recognize(imageUrl);
  return { text: result.data.text };
}
// When the language changes, mirror it in the URL bar (English, the default,
// gets no query string) and redo the OCR of the displayed images
languageSelect.addEventListener('change', async ({ target }) => {
  const language = target.value;
  const { pathname } = window.location;
  const newUrl = language === 'eng' ? pathname : `${pathname}?language=${language}`;
  window.history.pushState({ path: newUrl }, '', newUrl);
  await rerunOCR();
});
/**
 * Select the language named by the `language` query-string parameter.
 *
 * Falls back to 'eng' when the parameter is missing, empty, or does not match
 * any option in the select box. Assigning an unknown value would otherwise
 * leave the select without a selected option.
 */
function setLanguageFromQueryString() {
  const requested = new URLSearchParams(window.location.search).get('language');
  const isKnown = Array.from(languageSelect.options).some(
    (option) => option.value === requested
  );
  languageSelect.value = isKnown ? requested : 'eng';
}
// Sync the select box with the query string when the page loads and again
// whenever a popstate event fires
for (const eventName of ['load', 'popstate']) {
  window.addEventListener(eventName, () => {
    setLanguageFromQueryString();
  });
}
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment