library("tesseract")
library("magick")
library("magrittr")

# Batch OCR: every JPEG found in `dest` is cleaned up with magick, passed to
# tesseract, and the recognised text is written next to the source image as
# "<image path>-ocr.txt".

# load 'em up
dest <- "/path/to/images"
# Anchor the pattern to the file extension. `pattern` is a regular expression,
# so a bare "jpg" matches anywhere in the file name -- including the
# "<name>.jpg-ocr.txt" files this script writes into the same directory, which
# would then be fed to image_read() on the next run.
myfiles <- list.files(path = dest, pattern = "\\.jpg$", full.names = TRUE)

# improve the images, ocr 'em, write the output to a text file
# invisible(): the loop is run purely for its side effect (the text files), so
# the resulting list of NULLs is not printed.
invisible(lapply(myfiles, function(i) {
  text <- image_read(i) %>%
    image_resize("3000x") %>%
    image_convert(type = "Grayscale") %>%
    image_trim(fuzz = 40) %>%
    # No `path` is given, so image_write() hands back the encoded PNG as a raw
    # vector, which is what gets piped into ocr().
    image_write(format = "png", density = "300x300") %>%
    tesseract::ocr()
  # Output lives beside the image: "<dir>/<name>.jpg-ocr.txt".
  outfile <- paste0(i, "-ocr.txt")
  cat(text, file = outfile, sep = "\n")
}))
# (GitHub page footer captured along with the gist; not part of the script.)
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment