Skip to content

Instantly share code, notes, and snippets.

@bitnik
Last active February 22, 2023 10:28
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bitnik/0f1067267d28b4efb08988b121696c14 to your computer and use it in GitHub Desktop.
Save bitnik/0f1067267d28b4efb08988b121696c14 to your computer and use it in GitHub Desktop.
Scan pdfs and rename them according to data in barcodes they contain.
import tempfile
import glob
import argparse
# import xlrd
import pandas
from os.path import join, basename
from shutil import copyfile
from pdf2image import convert_from_path # , convert_from_bytes
from pyzbar.pyzbar import decode
# from PIL import Image
def cihan(input_folder, excel_file, output_folder):
    """Copy each PDF in a folder to a new name derived from its barcodes.

    Every ``*.pdf`` file directly inside ``input_folder`` is rendered to page
    images, the barcodes on those pages are decoded, and each decoded value is
    looked up as a ``TaskId`` in the Excel sheet. The copy placed in
    ``output_folder`` is named ``<ElementCode>_<DocName>[,<ElementCode>_<DocName>...].pdf``.
    A ``summary.txt`` listing ``<original name> <new name>`` per file is
    written to ``output_folder`` at the end.

    Args:
        input_folder: Folder that is searched for ``*.pdf`` files.
        excel_file: Path of the Excel file; it must provide the columns
            ``TaskId``, ``ElementCode`` and ``DocName``.
        output_folder: Existing folder that receives the renamed copies and
            ``summary.txt``.

    Raises:
        IndexError: If a scanned TaskId does not occur in the Excel file.
        Exception: If a scanned TaskId occurs more than once in the Excel file.
    """
    summary = []
    # Load the Excel file and keep only the columns needed for renaming.
    df = pandas.read_excel(excel_file)
    df = df[['TaskId', 'ElementCode', 'DocName']]
    for file in glob.glob(join(input_folder, '*.pdf')):
        file_name = basename(file)
        task_ids = []
        # The page images are produced with the temporary directory as their
        # output folder, so they are decoded before that directory is removed.
        with tempfile.TemporaryDirectory() as path:
            for image in convert_from_path(file, output_folder=path):
                for d in decode(image):
                    print(file_name, d.type, d.data.decode("utf-8"))
                    try:
                        task_ids.append(int(d.data))
                    except ValueError:
                        # A barcode that is not a number cannot be a TaskId;
                        # skip it instead of aborting the whole batch.
                        print(file_name, 'skipping non-numeric barcode')
        # Compute the new name from the Excel rows matching the task ids.
        name_parts = []
        for task_id in task_ids:
            i = df.index[df['TaskId'] == task_id].tolist()
            if not i:
                # Previously this surfaced as an unexplained IndexError from
                # i[0]; the exception type is kept, the message is new.
                raise IndexError('TaskId {} is not in excel file!'.format(task_id))
            if len(i) > 1:
                raise Exception('TaskId {} is multiple times in excel file!'.format(task_id))
            name_parts.append('{}_{}'.format(df['ElementCode'][i[0]], df['DocName'][i[0]]))
        if name_parts:
            new_name = '{}.pdf'.format(','.join(name_parts))
        else:
            # Without any usable barcode the name used to collapse to '.pdf',
            # so every such file overwrote the previous one. Keep the
            # original file name instead.
            print(file_name, 'no usable barcode found, keeping original name')
            new_name = file_name
        print('new name: ', new_name)
        copyfile(file, join(output_folder, new_name))
        summary.append('{} {}\n'.format(file_name, new_name))
    with open(join(output_folder, 'summary.txt'), 'w') as f:
        f.writelines(summary)
def get_args():
    """Parse the command line arguments of the script.

    Returns:
        argparse.Namespace: Holds ``input_folder``, ``excel_file`` and
        ``output_folder``; all three options are required.
    """
    parser = argparse.ArgumentParser(
        description='Scan pdfs and rename them according to '
                    'data in barcodes they contain.'
                    '\nTested on python 3.6.'
                    '\nRequirements: pip install pdf2image pyzbar image pandas xlrd',
        # The description contains explicit line breaks; the default formatter
        # re-wraps the text and drops them, the raw formatter prints it as is.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('-i', '--input_folder', required=True, help='Where all pdf files take place.')
    parser.add_argument('-e', '--excel_file', required=True, help='Path of excel file.')
    parser.add_argument('-o', '--output_folder', required=True, help='Where to copy new renamed pdf files.')
    return parser.parse_args()
if __name__ == '__main__':
    # Script entry point: read the command line options and run the renaming.
    # Example call:
    # python cihan.py -i '/home/kenan/PycharmProjects/cihan/pdfs' -e '/home/kenan/PycharmProjects/cihan/Book2.xlsx' -o '/home/kenan/PycharmProjects/cihan/output'
    args = get_args()
    cihan(
        input_folder=args.input_folder,
        excel_file=args.excel_file,
        output_folder=args.output_folder,
    )
pdf2image==0.1.11
pyzbar==0.1.7
image==1.5.24
pandas==0.23.0
xlrd==1.1.0
@bitnik
Copy link
Author

bitnik commented Nov 6, 2022

Hi @Emin828, I wrote this script a long time ago for a friend and have never used it again. I don't have any example input files anymore either, so it is really hard for me to test the script right now.

Can you please write the full command and the full log output?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment