Skip to content

Instantly share code, notes, and snippets.

@kyrre
Created March 19, 2015 17:22
Show Gist options
  • Save kyrre/c19f94f8662194be1c3d to your computer and use it in GitHub Desktop.
Save kyrre/c19f94f8662194be1c3d to your computer and use it in GitHub Desktop.
Convert PDF files to text via OCR (rasterize with Wand, then recognize with pytesseract)
def pdf_to_str(filename, resolution=600):
    """Rasterize a PDF to JPEG and return the text recognized by OCR.

    The PDF is opened with Wand at ``resolution``, switched to grayscale,
    saved as a quality-100 JPEG inside a fresh temporary directory, and that
    directory is handed to ``ocr_directory``. The temporary directory is
    removed afterwards whether or not the conversion succeeded.

    Args:
        filename: Path to the PDF file to read.
        resolution: Resolution passed to ``wand.image.Image`` when opening
            the PDF.

    Returns:
        The text produced by ``ocr_directory``, or ``None`` if any step
        fails (this function is deliberately best-effort).
    """
    try:
        # Keep only the base name without its extension: a path with
        # directory components (or an absolute path) would otherwise make
        # os.path.join point outside the temporary directory.
        stem = os.path.splitext(os.path.basename(filename))[0]
        directory = tempfile.mkdtemp()
        try:
            output_path = os.path.join(directory, stem + '.jpg')
            with wand.image.Image(filename=filename,
                                  resolution=resolution) as img:
                img.type = 'grayscale'
                img.format = 'jpeg'
                img.compression_quality = 100
                with img.convert('jpeg') as converted:
                    converted.save(filename=output_path)
            # The source image is closed before OCR starts so the
            # full-resolution raster is not held open while it runs.
            return ocr_directory(directory)
        finally:
            # Always clean up; a failed cleanup must not discard the text.
            shutil.rmtree(directory, ignore_errors=True)
    except Exception:
        # Best-effort contract: callers get None on any failure. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return None
def img_to_str(img_path):
    """Run OCR on a single image file and return the recognized text.

    Args:
        img_path: Path to the image file to open.

    Returns:
        The result of ``pytesseract.image_to_string`` for the image, or
        ``None`` if the image cannot be opened or OCR fails.
    """
    try:
        # NOTE(review): assumes ``Image`` is Pillow's ``PIL.Image`` module,
        # whose images support the context-manager protocol - confirm
        # against the file's imports. The ``with`` closes the underlying
        # file handle as soon as OCR is done.
        with Image.open(img_path) as image:
            return pytesseract.image_to_string(image)
    except Exception:
        # Best-effort: report failure as None, but let
        # KeyboardInterrupt/SystemExit propagate.
        return None
def _natural_sort_key(path):
    """Split *path* into text and integer runs so 'p-10' sorts after 'p-2'."""
    import re  # local import: the file's import block is not touched here

    parts = re.split(r'(\d+)', path)
    # re.split with one capture group alternates text, digits, text, ...
    # so odd positions are always digit runs and types line up when compared.
    return [int(part) if index % 2 else part
            for index, part in enumerate(parts)]


def ocr_directory(directory):
    """Run OCR on every file in *directory* and join the results.

    Files are processed in natural (numeric-aware) name order so that the
    output is deterministic and numbered files such as ``doc-2.jpg`` and
    ``doc-10.jpg`` come out in numeric order; ``glob`` alone returns
    entries in arbitrary order.

    Args:
        directory: Path of the directory whose files are passed to
            ``img_to_str``.

    Returns:
        The recognized texts joined with newlines. Files for which
        ``img_to_str`` returns ``None`` are skipped instead of making the
        join fail; an empty directory yields ``''``.
    """
    filenames = sorted(glob.glob(os.path.join(directory, '*')),
                       key=_natural_sort_key)
    texts = (img_to_str(image) for image in filenames)
    return '\n'.join(text for text in texts if text is not None)
# Example run: OCR 'all.pdf' and print the text (or None on failure).
# The parenthesized form prints the same on Python 2 and is valid on Python 3.
print(pdf_to_str('all.pdf'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment