Skip to content

Instantly share code, notes, and snippets.

@hypernova7
Last active October 16, 2022 21:52
Show Gist options
  • Save hypernova7/9f3586a681c5aa6fc404ccb909f03f7b to your computer and use it in GitHub Desktop.
Save hypernova7/9f3586a681c5aa6fc404ccb909f03f7b to your computer and use it in GitHub Desktop.
Telegram OCR Bot
import { Telegraf } from 'telegraf'
import { getTextFromImage, langs } from './ocr'
const bot = new Telegraf(process.env.BOT_TOKEN);

/**
 * `/ocr <lang>` — extract the text of the photo this command replies to.
 *
 * - `/ocr langs` replies with the list of supported language codes.
 * - `/ocr <code>` (sent as a reply to a photo) runs OCR with that language
 *   and answers with the recognized text.
 */
bot.command('ocr', async (ctx) => {
  const reply_to = ctx.message.reply_to_message;
  // Drop the command itself (also when addressed as /ocr@BotName) to get its argument.
  const query = ctx.message.text.replace(/^\/ocr(?:@\S+)?/, '').trim();

  if (query === 'langs') {
    let list = '';
    for (const lng in langs) {
      list += `\u2014 <b>${lng}</b>: ${langs[lng]}\n`;
    }
    return ctx.replyWithHTML(`List of available languages:\n\n${list}`);
  }

  // Own-property check so inherited keys such as "constructor" are rejected.
  if (!Object.prototype.hasOwnProperty.call(langs, query)) {
    return ctx.replyWithHTML(`<b>Language not found</b>\nSend <code>/ocr langs</code> to see a list of available languages`);
  }

  // The command only makes sense as a reply to a message carrying a photo.
  if (!reply_to || !reply_to.photo || reply_to.photo.length === 0) {
    return ctx.reply('Reply to a photo with /ocr <language> to extract its text');
  }

  // The last entry is taken as the best quality; read it without mutating the message.
  const { file_id } = reply_to.photo[reply_to.photo.length - 1];
  // NOTE(review): newer Telegraf versions appear to resolve getFileLink to a URL object
  // rather than a string; normalise so the OCR helper always receives a string.
  const url = String(await ctx.telegram.getFileLink(file_id));
  // Run OCR with the language the user asked for (validated above).
  const text = await getTextFromImage(query, url);

  // getTextFromImage yields undefined on failure; never send an empty message.
  const answer = text && text.trim() ? text : 'No text could be recognized in this image';
  return ctx.reply(answer, {
    reply_to_message_id: reply_to.message_id,
    allow_sending_without_reply: true
  });
});

bot.launch(); // start bot
import { resolve } from 'node:path';
import { readFile } from 'node:fs/promises';
import { createOCRClient } from 'tesseract-wasm/node';
import sharp from 'sharp';
import got from 'got';
/**
 * Languages available for OCR.
 * Keys are the language codes used to pick a `<code>.traineddata` model,
 * values are the human-readable names shown to users.
 */
export const langs = {
  afr: 'Afrikaans',
  amh: 'Amharic',
  ara: 'Arabic',
  asm: 'Assamese',
  aze: 'Azerbaijani',
  aze_cyrl: 'Azerbaijani - Cyrillic',
  bel: 'Belarusian',
  ben: 'Bengali',
  bod: 'Tibetan',
  bos: 'Bosnian',
  bre: 'Breton',
  bul: 'Bulgarian',
  cat: 'Catalan; Valencian',
  ceb: 'Cebuano',
  ces: 'Czech',
  chi_sim: 'Chinese - Simplified',
  chi_tra: 'Chinese - Traditional',
  chr: 'Cherokee',
  cos: 'Corsican',
  cym: 'Welsh',
  dan: 'Danish',
  dan_frak: 'Danish - Fraktur (contrib)',
  deu: 'German',
  deu_frak: 'German - Fraktur (contrib)',
  dzo: 'Dzongkha',
  ell: 'Greek, Modern (1453-)',
  eng: 'English',
  enm: 'English, Middle (1100-1500)',
  epo: 'Esperanto',
  equ: 'Math / equation detection module',
  est: 'Estonian',
  eus: 'Basque',
  fao: 'Faroese',
  fas: 'Persian',
  fil: 'Filipino (old - Tagalog)',
  fin: 'Finnish',
  fra: 'French',
  frk: 'German - Fraktur',
  frm: 'French, Middle (ca.1400-1600)',
  fry: 'Western Frisian',
  gla: 'Scottish Gaelic',
  gle: 'Irish',
  glg: 'Galician',
  grc: 'Greek, Ancient (to 1453) (contrib)',
  guj: 'Gujarati',
  hat: 'Haitian; Haitian Creole',
  heb: 'Hebrew',
  hin: 'Hindi',
  hrv: 'Croatian',
  hun: 'Hungarian',
  hye: 'Armenian',
  iku: 'Inuktitut',
  ind: 'Indonesian',
  isl: 'Icelandic',
  ita: 'Italian',
  ita_old: 'Italian - Old',
  jav: 'Javanese',
  jpn: 'Japanese',
  kan: 'Kannada',
  kat: 'Georgian',
  kat_old: 'Georgian - Old',
  kaz: 'Kazakh',
  khm: 'Central Khmer',
  kir: 'Kirghiz; Kyrgyz',
  kmr: 'Kurmanji (Kurdish - Latin Script)',
  kor: 'Korean',
  kor_vert: 'Korean (vertical)',
  kur: 'Kurdish (Arabic Script)',
  lao: 'Lao',
  lat: 'Latin',
  lav: 'Latvian',
  lit: 'Lithuanian',
  ltz: 'Luxembourgish',
  mal: 'Malayalam',
  mar: 'Marathi',
  mkd: 'Macedonian',
  mlt: 'Maltese',
  mon: 'Mongolian',
  mri: 'Maori',
  msa: 'Malay',
  mya: 'Burmese',
  nep: 'Nepali',
  nld: 'Dutch; Flemish',
  nor: 'Norwegian',
  oci: 'Occitan (post 1500)',
  ori: 'Oriya',
  osd: 'Orientation and script detection module',
  pan: 'Panjabi; Punjabi',
  pol: 'Polish',
  por: 'Portuguese',
  pus: 'Pushto; Pashto',
  que: 'Quechua',
  ron: 'Romanian; Moldavian; Moldovan',
  rus: 'Russian',
  san: 'Sanskrit',
  sin: 'Sinhala; Sinhalese',
  slk: 'Slovak',
  slk_frak: 'Slovak - Fraktur (contrib)',
  slv: 'Slovenian',
  snd: 'Sindhi',
  spa: 'Spanish; Castilian',
  spa_old: 'Spanish; Castilian - Old',
  sqi: 'Albanian',
  srp: 'Serbian',
  srp_latn: 'Serbian - Latin',
  sun: 'Sundanese',
  swa: 'Swahili',
  swe: 'Swedish',
  syr: 'Syriac',
  tam: 'Tamil',
  tat: 'Tatar',
  tel: 'Telugu',
  tgk: 'Tajik',
  tgl: 'Tagalog (new - Filipino)',
  tha: 'Thai',
  tir: 'Tigrinya',
  ton: 'Tonga',
  tur: 'Turkish',
  uig: 'Uighur; Uyghur',
  ukr: 'Ukrainian',
  urd: 'Urdu',
  uzb: 'Uzbek',
  uzb_cyrl: 'Uzbek - Cyrillic',
  vie: 'Vietnamese',
  yid: 'Yiddish',
  yor: 'Yoruba'
};
// Lookup table of supported language codes: every key of `langs` maps to itself,
// so a requested code can be checked and normalised with a single property read.
const abbrLangs = Object.fromEntries(
  Object.keys(langs).map((code) => [code, code])
);
/**
 * Read the raw bytes of an image from an http(s) URL or a local file.
 * Declared `async` so that a synchronous failure while building the path
 * surfaces as a rejection instead of a throw.
 *
 * @param {string} source - http(s) URL, or a file path resolved against `__dirname`.
 * @returns {Promise<Buffer>} the encoded image bytes.
 */
async function readImageBuffer (source) {
  if (source.startsWith('http')) {
    return got(source).buffer();
  }
  // NOTE(review): `__dirname` only exists under CommonJS; confirm the build
  // transpiles this module, otherwise local paths can never be read.
  return readFile(resolve(__dirname, source));
}

/**
 * Recognize the text contained in an image.
 *
 * @param {string} [lang='eng'] - Language code (a key of `langs`). Empty, missing
 *   or unknown codes fall back to `'eng'`.
 * @param {string|URL} image - http(s) URL or local file path of the image.
 * @returns {Promise<string|undefined>} The recognized text, or `undefined` when any
 *   step fails (the error is logged, never thrown).
 */
export async function getTextFromImage (lang = 'eng', image) {
  if (image == null) {
    console.error('getTextFromImage: no image was provided');
    return undefined;
  }

  // Own-property check so inherited keys (e.g. "constructor") are not mistaken for a code.
  const code = typeof lang === 'string' && Object.prototype.hasOwnProperty.call(abbrLangs, lang)
    ? abbrLangs[lang]
    : 'eng';
  const modelURL = `https://github.com/tesseract-ocr/tessdata_fast/raw/main/${code}.traineddata`;
  // Accept URL objects as well as strings.
  const source = String(image);

  let text;
  const client = createOCRClient(); // initialize tesseract
  try {
    // The model and the image are independent downloads: fetch them in parallel.
    const [model, buffer] = await Promise.all([
      got(modelURL).buffer(),
      readImageBuffer(source)
    ]);

    // Add an alpha channel before dumping raw pixels
    // (presumably because loadImage expects 4-channel RGBA data — TODO confirm).
    const img = sharp(buffer).ensureAlpha();
    const { width, height } = await img.metadata();
    const data = await img.raw().toBuffer();

    await client.loadModel(model); // load model
    await client.loadImage({ data, width, height }); // load image
    text = await client.getText(); // get text from image
  } catch (err) {
    // Keep the "undefined on failure" contract, but leave a trace of what went wrong.
    console.error('getTextFromImage failed:', err);
  } finally {
    client.destroy(); // destroy tesseract process
  }
  return text;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment