Skip to content

Instantly share code, notes, and snippets.

@KFDCompiled
Created November 3, 2017 16:36
Show Gist options
  • Save KFDCompiled/e2fd68df6652d7d35ed679b9d5651a3c to your computer and use it in GitHub Desktop.
Save KFDCompiled/e2fd68df6652d7d35ed679b9d5651a3c to your computer and use it in GitHub Desktop.
Second Attempt at Python Implementation of scal.pl
#import wget
import textract
# url = 'https://www.utcourts.gov/cal/data/SLC_Calendar.pdf'
# pdf = wget.download(url)
# Tokens expected below the "Page" line that opens the section of interest.
# Each entry is (row offset from the "Page" line, token index, expected token).
START_HEADER = (
    (2, 1, "3RD"),
    (3, 1, "BERNARDS-GOODMAN"),
    (7, 0, "September"),
    (7, 1, "29"),
    (7, 2, "2017"),
)

# Second token expected on the line after a "Page" line while still inside
# the section; any other value marks the end of the section.
SECTION_NAME = "BERNARDS-GOODMAN"


def token_at(rows, row, col):
    """Return rows[row][col], or None when either index is out of range."""
    if 0 <= row < len(rows) and 0 <= col < len(rows[row]):
        return rows[row][col]
    return None


def _matches(rows, row, col, expected):
    """Tell whether the token at (row, col) equals *expected*.

    A trailing comma on the token is ignored so that "29," matches "29".
    NOTE(review): whether the extracted date carries that comma depends on
    the PDF text output -- confirm against a real file.
    """
    token = token_at(rows, row, col)
    return token is not None and token.rstrip(",") == expected


def find_start(rows):
    """Return the index of the first row that opens the wanted section.

    A row qualifies when it contains the token "Page" in any position and
    the rows below it match every entry of START_HEADER.

    Args:
        rows: list of token lists, one per line of extracted text.

    Returns:
        The row index of the "Page" line, or None when no row qualifies.
    """
    for row, tokens in enumerate(rows):
        if "Page" not in tokens:
            continue
        if all(_matches(rows, row + offset, col, expected)
               for offset, col, expected in START_HEADER):
            return row
    return None


def find_end(rows, start):
    """Return the index of the first row after *start* that ends the section.

    A row ends the section when its second token is "Page" and the second
    token of the following row is not SECTION_NAME (a missing following row
    or token counts as "not SECTION_NAME").

    NOTE(review): the start header expects the name three rows below "Page"
    while this check looks one row below; the original offsets are kept --
    verify against the real text layout.

    Args:
        rows: list of token lists, one per line of extracted text.
        start: row index returned by find_start; scanning begins just after
            it so the opening "Page" line cannot be reported as the end.

    Returns:
        The row index of the terminating "Page" line, or None if not found.
    """
    for pos in range(start + 1, len(rows)):
        if (token_at(rows, pos, 1) == "Page"
                and token_at(rows, pos + 1, 1) != SECTION_NAME):
            return pos
    return None


def main():
    """Extract text from SLC_Calendar.pdf and report the section bounds."""
    print("Processing pdf into text...")
    pdf_text_raw = textract.process("SLC_Calendar.pdf")
    # Decode a byte string to text so comparisons against the str literals
    # above behave the same on Python 2 and Python 3.
    if isinstance(pdf_text_raw, bytes):
        pdf_text_raw = pdf_text_raw.decode("utf-8")
    print("Formatting...")
    pdf_text_lines = pdf_text_raw.splitlines()  # One entry per text line
    print("Structuring data...")
    # 2D array: one list of whitespace-separated tokens per line
    pdf_text_array = [line.split() for line in pdf_text_lines]
    print("Searching for start of entries...")
    start = find_start(pdf_text_array)
    if start is None:
        print("Start of entries not found")
        return
    print("Found start position %d" % start)
    end = find_end(pdf_text_array, start)
    if end is None:
        print("End of entries not found")
        return
    print("Found end position %d" % end)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment