Created
April 1, 2024 14:47
-
-
Save charanhu/f2929c9bd1c25375e39ef98e543c1efc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<head> | |
<title>OCR PDFs and images directly in your browser</title> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<script defer data-domain="tools.simonwillison.net" src="https://plausible.io/js/script.js"></script> | |
<script type="module"> | |
import pdfjsDist from 'https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.379/+esm'; | |
pdfjsLib.GlobalWorkerOptions.workerSrc = | |
"https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.379/build/pdf.worker.min.mjs"; | |
</script> | |
<script src="https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js"></script> | |
<style> | |
body { | |
padding: 1em; | |
font-family: helvetica, sans-serif; | |
line-height: 1.3; | |
} | |
.dropzone { | |
box-sizing: border-box; | |
width: 100%; | |
height: 10em; | |
border: 2px dashed #ccc; | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
font-size: 24px; | |
cursor: pointer; | |
padding: 1em; | |
margin-bottom: 1em; | |
} | |
.dropzone.disabled { | |
cursor: not-allowed; | |
} | |
.dropzone.drag-over { | |
background-color: pink; | |
} | |
.image-container img { | |
margin-bottom: 10px; | |
max-width: 100%; | |
} | |
textarea { | |
width: 100%; | |
height: 10em; | |
margin-bottom: 20px; | |
box-sizing: border-box; | |
} | |
.full-document-section { | |
display: none; | |
margin-bottom: 20px; | |
} | |
</style> | |
</head> | |
<body> | |
<h1>OCR PDFs and images directly in your browser</h1> | |
<p>This tool runs entirely in your browser. No files are uploaded to a server.</p> | |
<p>It uses <a href="https://tesseract.projectnaptha.com/">Tesseract.js</a> for OCR and <a | |
href="https://mozilla.github.io/pdf.js/">PDF.js</a> to convert PDFs into images.</p> | |
<p><label>Language: <select id="id_language"> | |
<option>ENG</option> | |
</select></label></p> | |
<input type="file" id="fileInput" accept=".pdf,.jpg,.jpeg,.png,.gif" style="display: none;" /> | |
<div class="dropzone" id="dropzone"> | |
Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file | |
</div> | |
<div class="full-document-section" id="fullDocumentSection"> | |
<h2>Full document</h2> | |
<textarea class="full-document" id="fullDocument"></textarea> | |
<h2>Pages</h2> | |
</div> | |
<div class="image-container"></div> | |
<p><a href="https://github.com/simonw/tools/blob/main/ocr.html">Source code</a>. <a | |
href="https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/">How I built this</a>.</p> | |
<script> | |
// Pixel width of the canvas each PDF page is rendered onto before OCR.
const desiredWidth = 1000;

// Page elements the script reads from and writes to.
const dropzone = document.querySelector('#dropzone');
const fileInput = document.querySelector('#fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.querySelector('#fullDocument');
const fullDocumentSection = document.querySelector('#fullDocumentSection');
const languageSelect = document.querySelector('#id_language');

// Set to false while a file is being processed so drops and clicks are ignored.
let fileSelectionAllowed = true;
// Human-readable names for the language codes offered in the select box,
// keyed by the code that is used as each <option> value.
const LANGUAGES = {
  afr: 'Afrikaans',
  amh: 'Amharic',
  ara: 'Arabic',
  asm: 'Assamese',
  aze: 'Azerbaijani',
  aze_cyrl: 'Azerbaijani - Cyrillic',
  bel: 'Belarusian',
  ben: 'Bengali',
  bod: 'Tibetan',
  bos: 'Bosnian',
  bul: 'Bulgarian',
  cat: 'Catalan; Valencian',
  ceb: 'Cebuano',
  ces: 'Czech',
  chi_sim: 'Chinese - Simplified',
  chi_tra: 'Chinese - Traditional',
  chr: 'Cherokee',
  cym: 'Welsh',
  dan: 'Danish',
  deu: 'German',
  dzo: 'Dzongkha',
  ell: 'Greek, Modern (1453-)',
  eng: 'English',
  enm: 'English, Middle (1100-1500)',
  epo: 'Esperanto',
  est: 'Estonian',
  eus: 'Basque',
  fas: 'Persian',
  fin: 'Finnish',
  fra: 'French',
  frk: 'German Fraktur',
  frm: 'French, Middle (ca. 1400-1600)',
  gle: 'Irish',
  glg: 'Galician',
  grc: 'Greek, Ancient (-1453)',
  guj: 'Gujarati',
  hat: 'Haitian; Haitian Creole',
  heb: 'Hebrew',
  hin: 'Hindi',
  hrv: 'Croatian',
  hun: 'Hungarian',
  iku: 'Inuktitut',
  ind: 'Indonesian',
  isl: 'Icelandic',
  ita: 'Italian',
  ita_old: 'Italian - Old',
  jav: 'Javanese',
  jpn: 'Japanese',
  kan: 'Kannada',
  kat: 'Georgian',
  kat_old: 'Georgian - Old',
  kaz: 'Kazakh',
  khm: 'Central Khmer',
  kir: 'Kirghiz; Kyrgyz',
  kor: 'Korean',
  kur: 'Kurdish',
  lao: 'Lao',
  lat: 'Latin',
  lav: 'Latvian',
  lit: 'Lithuanian',
  mal: 'Malayalam',
  mar: 'Marathi',
  mkd: 'Macedonian',
  mlt: 'Maltese',
  msa: 'Malay',
  mya: 'Burmese',
  nep: 'Nepali',
  nld: 'Dutch; Flemish',
  nor: 'Norwegian',
  ori: 'Oriya',
  pan: 'Panjabi; Punjabi',
  pol: 'Polish',
  por: 'Portuguese',
  pus: 'Pushto; Pashto',
  ron: 'Romanian; Moldavian; Moldovan',
  rus: 'Russian',
  san: 'Sanskrit',
  sin: 'Sinhala; Sinhalese',
  slk: 'Slovak',
  slv: 'Slovenian',
  spa: 'Spanish; Castilian',
  spa_old: 'Spanish; Castilian - Old',
  sqi: 'Albanian',
  srp: 'Serbian',
  srp_latn: 'Serbian - Latin',
  swa: 'Swahili',
  swe: 'Swedish',
  syr: 'Syriac',
  tam: 'Tamil',
  tel: 'Telugu',
  tgk: 'Tajik',
  tgl: 'Tagalog',
  tha: 'Thai',
  tir: 'Tigrinya',
  tur: 'Turkish',
  uig: 'Uighur; Uyghur',
  ukr: 'Ukrainian',
  urd: 'Urdu',
  uzb: 'Uzbek',
  uzb_cyrl: 'Uzbek - Cyrillic',
  vie: 'Vietnamese',
  yid: 'Yiddish',
};
// Populate the languages select box.
// First drop the static placeholder <option> that ships in the markup.
while (languageSelect.firstChild) {
  languageSelect.removeChild(languageSelect.firstChild);
}
for (const code of Object.values(Tesseract.languages)) {
  const option = document.createElement('option');
  option.value = code;
  // Fall back to the raw code when LANGUAGES has no display name for it, so a
  // language offered by the library but missing from the table is still
  // visible and selectable instead of rendering as an unlabeled entry.
  option.textContent = LANGUAGES[code] || code;
  if (code === 'eng') {
    // English is the default selection.
    option.selected = true;
  }
  languageSelect.appendChild(option);
}
/**
 * Refresh the combined "Full document" textarea from the per-page textareas.
 *
 * The section is only shown when more than one page textarea has non-blank
 * text; otherwise it is emptied and hidden.
 */
function showFullDocument() {
  const pageTexts = [...document.querySelectorAll('.image-container textarea')]
    .map((textarea) => textarea.value.trim())
    .filter((text) => text.length > 0);
  const hasMultiplePages = pageTexts.length > 1;

  fullDocumentTextarea.value = hasMultiplePages ? pageTexts.join("\n\n") : '';
  fullDocumentSection.style.display = hasMultiplePages ? 'block' : 'none';
}
/**
 * Put trimmed text into a textarea and resize the textarea to fit it.
 *
 * @param {HTMLTextAreaElement} ta - Textarea to fill.
 * @param {string} text - Text to display; surrounding whitespace is trimmed.
 */
function setTextarea(ta, text) {
  const { style } = ta;
  ta.value = text.trim();
  // Reset the height first so scrollHeight is read after the box has been
  // allowed to shrink, then add a few pixels of slack.
  style.height = 'auto';
  style.height = `${ta.scrollHeight + 5}px`;
}
// Wire the dropzone up to its drag-and-drop and click handlers.
const dropzoneHandlers = [
  ['dragover', handleDragOver],
  ['dragleave', handleDragLeave],
  ['drop', handleDrop],
  ['click', handleClick],
];
for (const [eventName, handler] of dropzoneHandlers) {
  dropzone.addEventListener(eventName, handler);
}
/**
 * Highlight the dropzone while something is dragged over it.
 *
 * The default action is always cancelled; the highlight is only applied
 * while file selection is allowed.
 *
 * @param {DragEvent} event
 */
async function handleDragOver(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.add('drag-over');
}
/**
 * Remove the dropzone highlight when the drag leaves it.
 *
 * The default action is always cancelled; the highlight is only touched
 * while file selection is allowed.
 *
 * @param {DragEvent} event
 */
async function handleDragLeave(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.remove('drag-over');
}
/**
 * Handle something being dropped on the dropzone.
 *
 * The default action is always cancelled so the browser does not navigate to
 * the dropped item. When file selection is allowed and the drop carries at
 * least one file, the first file is mirrored onto the hidden file input and
 * handed to processFile().
 *
 * @param {DragEvent} event
 * @returns {Promise<void>} Settles when processing of the dropped file ends.
 */
async function handleDrop(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.remove('drag-over');
  const { files } = event.dataTransfer;
  const file = files[0];
  // Dragging text or a link produces a drop with no files; there is nothing
  // to OCR in that case.
  if (!file) {
    return;
  }
  fileInput.files = files;
  // Await so the promise returned by processFile() is not left floating.
  await processFile(file);
}
/**
 * Open the hidden file picker when the dropzone is clicked, unless file
 * selection is currently disallowed.
 */
async function handleClick() {
  if (!fileSelectionAllowed) {
    return;
  }
  fileInput.click();
}
// OCR the file chosen through the hidden file picker.
fileInput.addEventListener('change', (event) => {
  const file = event.target.files[0];
  // The file list can be empty (for example when a previous selection is
  // cleared); passing undefined on would fail on `file.type`.
  if (!file) {
    return;
  }
  processFile(file);
});
/**
 * OCR a single file and render the results into the page.
 *
 * A PDF (MIME type "application/pdf") is converted page by page through
 * convertPDFToImages(); anything else is treated as one image. Each image is
 * displayed with a textarea that receives its recognized text, and the
 * combined document is refreshed after every page.
 *
 * While the run is active the dropzone is marked disabled and file selection
 * is blocked. The dropzone state is restored and the OCR worker terminated
 * even when a step fails; the failure still rejects the returned promise.
 *
 * @param {File} file - File to OCR. A missing file is ignored.
 * @returns {Promise<void>}
 */
async function processFile(file) {
  // Nothing to do for a cancelled picker, an empty drop or a null paste.
  if (!file) {
    return;
  }
  // Ignore new files while a run is active. A second concurrent run would
  // capture the temporary "Processing..." label as the text to restore.
  if (!fileSelectionAllowed) {
    return;
  }

  const originalText = dropzone.innerText;
  fileSelectionAllowed = false;
  dropzone.classList.add('disabled');
  dropzone.innerText = 'Processing file...';
  fullDocumentTextarea.value = '';
  fullDocumentSection.style.display = 'none';
  imageContainer.innerHTML = '';

  let worker;
  try {
    worker = await Tesseract.createWorker(languageSelect.value);
    if (file.type === 'application/pdf') {
      const { numPages, imageIterator } = await convertPDFToImages(file);
      let done = 0;
      dropzone.innerText = `Processing ${numPages} page${numPages > 1 ? 's' : ''}`;
      for await (const { imageURL } of imageIterator) {
        const ta = await displayImage(imageURL);
        const { text } = await ocrImage(worker, imageURL);
        setTextarea(ta, text);
        showFullDocument();
        done += 1;
        dropzone.innerText = `Done ${done} of ${numPages}`;
      }
    } else {
      // The object URL stays alive on purpose: the displayed <img> uses it
      // and rerunOCR() reads it back from the image's src.
      const imageURL = URL.createObjectURL(file);
      const ta = await displayImage(imageURL);
      const { text } = await ocrImage(worker, imageURL);
      setTextarea(ta, text);
      showFullDocument();
    }
  } finally {
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      // Always hand the dropzone back, even if OCR or termination failed.
      dropzone.innerText = originalText;
      dropzone.classList.remove('disabled');
      fileSelectionAllowed = true;
    }
  }
}
/**
 * Append an image, followed by an empty textarea for its OCR text, to the
 * image container.
 *
 * @param {string} imageURL - URL assigned to the new image's src.
 * @returns {Promise<HTMLTextAreaElement>} The textarea created for the image.
 */
async function displayImage(imageURL) {
  const picture = Object.assign(document.createElement('img'), { src: imageURL });
  imageContainer.appendChild(picture);

  const ocrOutput = document.createElement('textarea');
  ocrOutput.classList.add('textarea-alt');
  ocrOutput.placeholder = 'OCRing image...';
  imageContainer.appendChild(ocrOutput);

  return ocrOutput;
}
/**
 * Re-run OCR over every image already on the page using the currently
 * selected language, replacing the text of the matching textareas.
 *
 * Does nothing when no textareas are present. If the page was idle when the
 * rerun started, the dropzone is disabled for its duration and its label is
 * restored afterwards. The OCR worker is terminated even when recognition
 * fails; the failure still rejects the returned promise.
 *
 * @returns {Promise<void>}
 */
async function rerunOCR() {
  const textareas = document.querySelectorAll('.image-container textarea');
  const images = document.querySelectorAll('.image-container img');
  if (!textareas.length) {
    return;
  }

  // Only take over the dropzone when no other run owns it. If one does, that
  // run restores the label and the flag itself.
  const wasIdle = fileSelectionAllowed;
  const originalText = dropzone.innerText;
  if (wasIdle) {
    fileSelectionAllowed = false;
    dropzone.classList.add('disabled');
  }

  let worker;
  try {
    // Blank all the textareas
    for (const ta of textareas) {
      ta.value = '';
    }
    showFullDocument();
    const numImages = images.length;
    let done = 0;
    dropzone.innerText = `Processing ${numImages} image${numImages > 1 ? 's' : ''}`;
    worker = await Tesseract.createWorker(languageSelect.value);
    for (let i = 0; i < numImages; i++) {
      const ta = textareas[i];
      ta.placeholder = 'OCRing image...';
      const { text } = await ocrImage(worker, images[i].src);
      setTextarea(ta, text);
      showFullDocument();
      done += 1;
      dropzone.innerText = `Done ${done} of ${numImages}`;
    }
  } finally {
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      if (wasIdle) {
        // Without this the dropzone keeps showing "Done n of n" forever.
        dropzone.innerText = originalText;
        dropzone.classList.remove('disabled');
        fileSelectionAllowed = true;
      }
    }
  }
}
// OCR the first image found on the clipboard when the user pastes into the page.
document.addEventListener('paste', (event) => {
  // Ignore pastes while a file is already being processed.
  if (!fileSelectionAllowed) {
    return;
  }
  // Native paste events expose clipboardData directly; the former
  // `event.originalEvent` fallback only exists on jQuery-wrapped events and
  // threw here whenever clipboardData was missing.
  const clipboard = event.clipboardData;
  if (!clipboard) {
    return;
  }
  const imageItem = Array.from(clipboard.items).find((item) => item.type.startsWith('image/'));
  // getAsFile() returns null when the item is not backed by a file.
  const file = imageItem ? imageItem.getAsFile() : null;
  if (!file) {
    return;
  }
  processFile(file).catch((error) => {
    console.error('OCR of pasted image failed:', error);
  });
});
/**
 * Open a PDF and expose its pages as lazily rendered JPEG data URLs.
 *
 * Each page is rendered onto a canvas `desiredWidth` pixels wide, with the
 * height scaled to keep the page's aspect ratio. A page that fails to render
 * is logged and skipped, so the iterator may yield fewer than `numPages`
 * items.
 *
 * The temporary object URL created for the file is revoked once the iterator
 * finishes (or is abandoned part-way through), and immediately if the
 * document cannot be opened.
 *
 * @param {Blob} file - The PDF file.
 * @returns {Promise<{numPages: number, imageIterator: AsyncGenerator<{imageURL: string}>}>}
 */
async function convertPDFToImages(file) {
  const pdfURL = URL.createObjectURL(file);
  let pdf;
  try {
    pdf = await pdfjsLib.getDocument(pdfURL).promise;
  } catch (error) {
    URL.revokeObjectURL(pdfURL);
    throw error;
  }
  const numPages = pdf.numPages;

  async function* images() {
    try {
      for (let i = 1; i <= numPages; i++) {
        try {
          const page = await pdf.getPage(i);
          const viewport = page.getViewport({ scale: 1 });
          const scale = desiredWidth / viewport.width;
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = scale * viewport.height;
          const renderContext = {
            canvasContext: context,
            viewport: page.getViewport({ scale }),
          };
          await page.render(renderContext).promise;
          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          yield { imageURL };
        } catch (error) {
          console.error(`Error rendering page ${i}:`, error);
        }
      }
    } finally {
      // Every page has been read (or the consumer stopped early), so the
      // blob URL backing the document is no longer needed.
      URL.revokeObjectURL(pdfURL);
    }
  }

  return { numPages, imageIterator: images() };
}
/**
 * Recognize the text in one image with the given worker.
 *
 * @param {object} worker - Worker exposing an async `recognize(url)` method.
 * @param {string} imageUrl - URL of the image to recognize.
 * @returns {Promise<{text: string}>} The recognized text.
 */
async function ocrImage(worker, imageUrl) {
  const result = await worker.recognize(imageUrl);
  return { text: result.data.text };
}
// Mirror the chosen language into the URL bar, then OCR the images again.
// The default language ('eng') is represented by the bare path.
languageSelect.addEventListener('change', async ({ target }) => {
  const language = target.value;
  const query = language !== 'eng' ? `?language=${language}` : '';
  const newUrl = `${window.location.pathname}${query}`;
  window.history.pushState({ path: newUrl }, '', newUrl);
  await rerunOCR();
});
/**
 * Select the language named by the `language` query-string parameter.
 *
 * Falls back to 'eng' when the parameter is absent, empty, or does not match
 * any option in the select box. Assigning an unknown value would leave the
 * select with no selection, and later OCR runs would read an empty language.
 */
function setLanguageFromQueryString() {
  const requested = new URLSearchParams(window.location.search).get('language');
  const isKnown = Array.from(languageSelect.options).some(
    (option) => option.value === requested
  );
  languageSelect.value = isKnown ? requested : 'eng';
}
// Keep the select box in step with the URL: once when the page has loaded,
// and again whenever the user moves through the session history.
for (const eventName of ['load', 'popstate']) {
  window.addEventListener(eventName, () => {
    setLanguageFromQueryString();
  });
}
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment