Skip to content

Instantly share code, notes, and snippets.

@KevinDanikowski
Created June 17, 2021 21:22
Show Gist options
  • Save KevinDanikowski/25cdcdda2ef4750bcf443f2027cc375a to your computer and use it in GitHub Desktop.
Save KevinDanikowski/25cdcdda2ef4750bcf443f2027cc375a to your computer and use it in GitHub Desktop.
useTesseract Hook
import { useEffect, useRef, useState } from 'react'
import { createWorker } from 'tesseract.js'
/**
 * Terminates a worker that is being discarded.
 * Failures are only logged: once the worker is dropped nothing can act on them.
 *
 * @param {{ terminate: Function }} worker Worker returned by `createWorker`.
 */
function terminateWorker(worker) {
  Promise.resolve()
    .then(() => worker.terminate())
    .catch((e) => {
      console.warn('WARN: Failed to terminate tesseract worker', e)
    })
}

/**
 * React hook that owns one tesseract.js worker and exposes OCR state/helpers.
 *
 * The worker is created on mount, re-initialised (and reused) whenever
 * `tesseractLanguage` changes, and terminated on unmount.
 *
 * @param {Object} [options]
 * @param {string} [options.tesseractLanguage='eng'] Language passed to the
 *   worker's `loadLanguage` and `initialize`.
 * @param {boolean} [options.log=false] When true, every worker log message is
 *   echoed with `console.info`. Captured when the worker is created.
 * @returns {Object} Hook state:
 *   - `imgResults`: `{}` initially, `{ html, text }` after a successful
 *     recognition, or `{ error: true }` after a failed one.
 *   - `loadingModel`: true while the worker/language is being (re)loaded.
 *   - `modelError`: true when the most recent load attempt failed.
 *   - `processing`: true while a recognition started by
 *     `extractTextFromImage` is pending.
 *   - `progress`: latest `progress` value from the worker's log messages.
 *   - `extractTextFromImage(imageUrl)`: runs OCR; returns a promise that
 *     settles (never rejects) once `imgResults` has been updated.
 */
export default function useTesseract({ tesseractLanguage = 'eng', log = false } = {}) {
  const [loadingModel, setLoadingModel] = useState(true)
  const [modelError, setModelError] = useState(false)
  const [imgResults, setImgResults] = useState({})
  const [processing, setProcessing] = useState(false)
  const [progress, setProgress] = useState(0)
  // The worker lives in a ref (not state) so the unmount cleanup always sees
  // the current instance and so storing it does not trigger a re-render.
  const workerRef = useRef(null)
  // Promise for the most recent load attempt; resolves to the ready worker.
  const readyRef = useRef(null)

  const extractTextFromImage = async (imageUrl) => {
    setProcessing(true)
    try {
      const ready = readyRef.current
      if (!ready) {
        throw new Error('tesseract worker is not available')
      }
      // Waiting on the load promise replaces guessing with a fixed delay:
      // recognition starts as soon as the model is ready, or fails if loading failed.
      const worker = await ready
      const {
        data: {
          hocr: htmlOutput,
          text,
          // tsv, box, unlv
        },
      } = await worker.recognize(imageUrl)
      setImgResults({ html: htmlOutput, text })
    } catch (e) {
      console.error('Tesseract Error:', e instanceof Error ? e.message : e)
      setImgResults({ error: true })
    } finally {
      setProcessing(false)
    }
  }

  const logger = (m) => {
    setProgress(m.progress)
    if (log) {
      console.info(m)
    }
  }

  useEffect(() => {
    // Set by the cleanup so a superseded load does not overwrite newer state.
    let cancelled = false
    setLoadingModel(true)
    setModelError(false)

    const previous = readyRef.current
    const loadTesseract = async () => {
      if (previous) {
        // Let an in-flight load settle first so the worker is never driven
        // before its own load() has finished. A failure there was already
        // reported by that attempt, which also discarded its worker.
        await previous.catch(() => {})
      }
      let worker = workerRef.current
      try {
        if (!worker) {
          worker = createWorker({
            logger,
            // specify paths because sometimes the free CDN goes down
            // corePath: '/static/tesseract-core.wasm.2.2.0.js',
            // workerPath: '/static/tesseract-worker.v2.1.4.min.js',
          })
          workerRef.current = worker
          await worker.load()
        }
        await worker.loadLanguage(tesseractLanguage)
        await worker.initialize(tesseractLanguage)
      } catch (e) {
        // Drop a worker that failed to load so the next attempt starts clean.
        // If the ref no longer points at it, unmount already terminated it.
        if (worker && workerRef.current === worker) {
          workerRef.current = null
          terminateWorker(worker)
        }
        throw e
      }
      console.info(`INFO: loaded ${tesseractLanguage} tesseract model`)
      return worker
    }

    const ready = loadTesseract()
    readyRef.current = ready
    ready.then(
      () => {
        if (!cancelled) {
          setLoadingModel(false)
        }
      },
      (e) => {
        console.error(`ERROR: Failed to load tesseract model`, e instanceof Error ? e.message : e)
        if (!cancelled) {
          setModelError(true)
          setLoadingModel(false)
        }
      }
    )

    return () => {
      cancelled = true
    }
  }, [tesseractLanguage])

  // Unmount only: release the worker and forget the pending load promise.
  useEffect(() => {
    return () => {
      const worker = workerRef.current
      workerRef.current = null
      readyRef.current = null
      if (worker) {
        terminateWorker(worker)
      }
    }
  }, [])

  return {
    imgResults,
    loadingModel,
    processing,
    modelError,
    progress,
    extractTextFromImage,
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment