Skip to content

Instantly share code, notes, and snippets.

@jackbow
Last active May 12, 2020 23:49
Show Gist options
  • Save jackbow/4235f45284b17655a76b8a060876ce13 to your computer and use it in GitHub Desktop.
Save jackbow/4235f45284b17655a76b8a060876ce13 to your computer and use it in GitHub Desktop.
For turning the Costco inventory PDFs into CSVs with prices for easy ordering. To use, download this file & run `python3 costco.py` in your terminal in the same directory as the file.
import string
import re
import sys
print("Upload the Costco pdf to https://ocr.space/, and select table recognition option, OCR Engine1, and save the output text to a txt file in the same directory as this script.")
path = input("Next, enter the name of the file: ")
f = open(path, 'r')
def hasNumbers(inputString):
return any(char.isdigit() for char in inputString)
def hasAlpha(inputString):
return any(char.isalpha() for char in inputString)
def split_on_char(s):
match = re.compile("[^\W\d]").search(s)
return [s[:match.start()], s[match.start():]]
def split_on_digit(s):
match = re.compile("\d").search(s)
return [s[:match.start()], s[match.start():]]
def split_on_money(s):
match = re.compile("(\d+)?\.?\d+$").search(s)
return [s[:match.start()], s[match.start():]]
def split_on_newline(s):
match = re.compile("\\n").search(s)
return [s[:match.start()], s[match.start():]]
def splitInsert(listt, idx, opt):
if opt == 'c':
csplit = split_on_char
if opt == 'd':
csplit = split_on_digit
if opt == 'm':
csplit = split_on_money
if opt == 'n':
csplit = split_on_newline
spp = csplit(listt[idx])
listt[idx] = spp[0]
listt.insert(idx+1, spp[1])
return listt
lines = f.readlines()
nlines = []
for i,v in enumerate(lines):
if not ((not v[0].isdigit()) or
(v[0] == '*') or
(v[0:9].count('/', 0, 8) >= 2) or
(v.count('icc.coop') > 0)):
# split line
sp = v.split('\t')
# remove trailing spaces
for j in range(len(sp)):
if sp[j][-1] == ' ':
sp[j] = sp[j][0:-1]
contains_nums = [True, False, False, True, True, False]
contains_alph = [False, True, True, True, False, False]
# split bad data
if len(sp) != 6:
for j in range(len(sp)):
if (hasNumbers(sp[j]) != contains_nums[j] or
hasAlpha(sp[j]) != contains_alph[j]):
if j == 0:
sp = splitInsert(sp, j, 'c')
if j == 1:
sp = splitInsert(sp, j, 'd')
if j == 2:
if not sp[j][0].isdigit():
sp = splitInsert(sp, j, 'd')
elif sp[j][-1].isdigit():
sp = splitInsert(sp, j, 'm')
if j == 3:
if hasAlpha(sp[j]) and hasNumbers(sp[j]):
sp = splitInsert(sp, j, 'm')
elif sp[j][-1] == '\n':
sp = splitInsert(sp, j, 'n')
if len(sp[j]) > 0 and (sp[j][-1] == '\n'):
sp = splitInsert(sp, j, 'n')
# fill missing data
if len(sp) != 6:
for j in range(len(sp)):
if (hasNumbers(sp[j]) != contains_nums[j] or
hasAlpha(sp[j]) != contains_alph[j]):
sp.insert(j, ' ')
if len(sp) == 6:
break
# remove spaces in item # & price
sp[0] = sp[0].replace(' ', '')
sp[-2] = sp[-2].replace(' ', '')
nlines.append('\t'.join(sp))
# if index error, run this code, and remove the \t from the relevant part:
# `print(v)`
# `lines[i] = <the text with the extra \t removed>`
# eg.
# `print(v): '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6\t.99\t\n'`
# `lines[i] = '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6.99\t\n'`
file = open('costco_inventory.csv', 'w')
file.writelines(nlines)
print("Number of items found: " + str(len(nlines)))
print("Output saved to costco_inventory.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment