Skip to content

Instantly share code, notes, and snippets.

@davidcesarino
Created February 14, 2020 15:42
Show Gist options
  • Save davidcesarino/2e332766b33c05c71c16b7277ca7388e to your computer and use it in GitHub Desktop.
Save davidcesarino/2e332766b33c05c71c16b7277ca7388e to your computer and use it in GitHub Desktop.
Given a folder of uncompressed PDF files containing lab results from HULW patients, print a text list of patient records and names, one entry per PDF.
# coding=utf-8
# Copyright 2020 David Cesarino de Sousa
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
# Path for uncompressed PDF files. You can generate the uncompressed files with the following bash script:
#
# #!/bin/bash
# for f in ./*.PDF; do
# file=$(echo $f | cut -c 3-8)-deflated.pdf
# qpdf --stream-data=uncompress $f ./uncompressed/$file
# done
#
# Steps:
# 1. Store your retrieved PDFs from Lisweb in your work directory. Let's call it WORKING_DIRECTORY.
# 2. Place the bash script in WORKING_DIRECTORY.
# 3. Run the bash script.
# 4. Replace 'WORKING_DIRECTORY' in the variable below with the full path of your real working directory.
# 5. Run this Python script.
uncompressed_pdf_path = 'WORKING_DIRECTORY/uncompressed'
# DO NOT EDIT ANYTHING BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING.
# The constants below are literal fragments that main() searches for, with a
# plain substring test, in each decoded line of an uncompressed PDF file.
#
# Fragment that marks the beginning of the record label. main() ignores every
# record-related line until a line containing this fragment has been seen.
# NOTE(review): presumably the start of the label "Prontuario" (patient
# record) as drawn in the PDF -- confirm against a sample file.
exp_record_start = '(Prontu'
# Fragment that marks the end of the record label. The first text-showing line
# after it is taken as the record value.
exp_record_last = '(o: ) Tj'
# Fragment that marks the name label. The first text-showing line after it is
# taken as the patient name.
# NOTE(review): presumably the tail of the label "Paciente: " -- confirm.
exp_name = '(ente: ) Tj'
# Marker of a line that shows text ("Tj" is the PDF show-text operator); only
# such lines are accepted as carrying a name or record value.
match_field = 'Tj'
# Characters a value may consist of for is_a_record() to accept it.
record_allowed = "0123456789/"
def grab_field(line):
    """Return the text between the first "(" and the last ")" of *line*.

    The outermost pair is used, so parentheses nested inside the field are
    kept as part of the result.

    Args:
        line: A decoded line of PDF content, e.g. ``"(JOHN DOE) Tj"``.

    Returns:
        The enclosed text, or an empty string when *line* has no opening
        parenthesis, no closing parenthesis, or the last ")" comes before the
        first "(".
    """
    start = line.find("(")
    end = line.rfind(")")
    # str.find/rfind return -1 when the character is missing. Slicing with
    # those values would silently return a chopped-up copy of the whole line
    # instead of a field, so report "no field" explicitly.
    if start == -1 or end < start:
        return ""
    return line[start + 1:end]
def pad_field(line, num):
    """Right-pad *line* with spaces to a width of *num* characters.

    Args:
        line: The text to pad.
        num: The minimum width of the result.

    Returns:
        *line* followed by as many spaces as needed to reach *num* characters.
        When *line* is already *num* characters or longer it is returned
        unchanged (it is never truncated).
    """
    return line.ljust(num)
def is_a_record(record):
    """Tell whether *record* consists only of characters in ``record_allowed``.

    Args:
        record: The candidate record text.

    Returns:
        True when every character of *record* occurs in the module-level
        ``record_allowed`` string. An empty string has no offending character
        and is therefore accepted as well.
    """
    return all(char in record_allowed for char in record)
def is_delimiter_enter_reached(match_str):
    """Tell whether *match_str* contains the record-start marker.

    Args:
        match_str: The text to inspect.

    Returns:
        True when ``exp_record_start`` occurs in *match_str*. If
        ``exp_record_start`` is a collection of markers rather than a single
        string, True when any one of them occurs.
    """
    markers = exp_record_start
    # Iterating directly over a str yields its individual characters, which
    # would make this return True for any text containing e.g. a lone "(" or
    # "o". Wrap a single marker so it is always matched as a whole.
    if isinstance(markers, str):
        markers = (markers,)
    return any(marker in match_str for marker in markers)
def _read_name_and_record(stream):
    """Scan the raw lines of one PDF for the patient name and the record.

    Args:
        stream: An iterable of ``bytes`` lines, such as a file opened in
            binary mode.

    Returns:
        A ``(name, data)`` tuple of strings. Either element is an empty
        string when the corresponding value was not found.
    """
    name = ''
    data = ''
    name_label_seen = False     # a line containing exp_name was found
    name_done = False           # the name value was captured
    record_label_open = False   # a line containing exp_record_start was found
    record_label_closed = False  # a line containing exp_record_last was found
    record_done = False         # the record value was captured

    for raw_line in stream:
        if record_done and name_done:
            break
        # The file is binary; undecodable bytes are dropped rather than raised.
        line = raw_line.decode(errors='ignore')

        if not name_done:
            if not name_label_seen and exp_name in line:
                name_label_seen = True
                continue
            elif name_label_seen and match_field in line:
                # First text-showing line after the name label.
                name = grab_field(line)
                name_done = True

        if not record_done:
            if exp_record_start in line:
                record_label_open = True
                continue
            elif not record_label_open:
                continue
            elif exp_record_last in line:
                record_label_closed = True
                continue
            elif record_label_closed and match_field in line:
                # First text-showing line after the complete record label.
                data = grab_field(line)
                # NOTE(review): 'Mdi' is treated as "no record" and blanked;
                # the origin of this token is not visible here -- confirm.
                data = '' if data == 'Mdi' else data
                record_done = True

    return name, data


def main():
    """Print one ``record = [name]`` line per file in ``uncompressed_pdf_path``.

    Only regular files directly inside the directory are read. A value that
    is_a_record() rejects is printed as ``* (value)`` so it stands out.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened
            (for example FileNotFoundError while ``uncompressed_pdf_path``
            still holds its placeholder value).
    """
    # iterdir() yields entries in arbitrary order; sort so that the listing is
    # reproducible from one run to the next.
    pdf_files = sorted(
        entry
        for entry in pathlib.Path(uncompressed_pdf_path).iterdir()
        if entry.is_file()
    )

    for pdf_file in pdf_files:
        with open(pdf_file, 'rb') as stream:
            name, data = _read_name_and_record(stream)
        # Now name contains the patient name, and data contains the record.
        formatted = data if is_a_record(data) else '* (' + data + ')'
        print(pad_field(formatted, 15) + '= [' + name + ']')
# Run the scan only when this file is executed as a script, so that importing
# the module (e.g. to reuse its helpers) does not trigger a directory scan.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment