Skip to content

Instantly share code, notes, and snippets.

@AlexandroMtzG
Last active February 5, 2023 23:48
Show Gist options
  • Save AlexandroMtzG/65e6ff82ed3b961c71391138f9648350 to your computer and use it in GitHub Desktop.
Save AlexandroMtzG/65e6ff82ed3b961c71391138f9648350 to your computer and use it in GitHub Desktop.
import Tesseract from "tesseract.js";
import PdfService from "./PdfService";
export const OcrTesseractLanguages = [
{ name: "English", value: "eng" },
{ name: "Spanish", value: "spa" },
];
async function scan(file: string, lang: string): Promise<string> {
return await new Promise(async (resolve, reject) => {
try {
if (file.endsWith(".pdf") || file.startsWith("data:application/pdf")) {
const images = await PdfService.convertToImages({ file });
console.log({ images });
let text = "";
for (const image of images) {
text += await scanImage(image.base64, lang);
}
// const text = await PdfService.convertToText(file);
resolve(text);
} else {
// OCR
const text = await scanImage(file, lang);
resolve(text);
}
} catch (e) {
// eslint-disable-next-line no-console
console.log(e);
reject(e);
}
});
}
async function scanImage(file: string, lang: string): Promise<string> {
// eslint-disable-next-line no-console
console.log("[OCR] Scanning image: ", file, lang);
return await new Promise(async (resolve, reject) => {
let image = file;
await Tesseract.recognize(image, lang, {
logger: (m) => {
// eslint-disable-next-line no-console
console.log("[OCR] Logger: ", JSON.stringify(m));
},
})
.then(({ data: { text } }) => {
// eslint-disable-next-line no-console
console.log("[OCR] Result: ", text);
resolve(text);
})
.catch((e) => {
// eslint-disable-next-line no-console
console.log("[OCR] Error: ", e);
reject(e);
});
});
}
export default {
scan,
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment