Created
February 14, 2020 15:42
-
-
Save davidcesarino/2e332766b33c05c71c16b7277ca7388e to your computer and use it in GitHub Desktop.
Given a folder of uncompressed PDF files containing lab results from HULW patients, prints a text list of patient records and names, one line per PDF.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
# Copyright 2020 David Cesarino de Sousa | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import pathlib | |
# Path for uncompressed PDF files. You can generate the uncompressed files with the following bash script:
#
#     #!/bin/bash
#     for f in ./*.PDF; do
#         file=$(echo $f | cut -c 3-8)-deflated.pdf
#         qpdf --stream-data=uncompress $f ./uncompressed/$file
#     done
#
# Steps:
# 1. Store the PDFs you retrieved from Lisweb in your working directory. Let's call it WORKING_DIRECTORY.
# 2. Place the bash script in WORKING_DIRECTORY.
# 3. Run the bash script.
# 4. Replace 'WORKING_DIRECTORY' in the variable below with the full path of your real working directory.
# 5. Run this Python script.
# Folder scanned by main(); every regular file directly inside it is read.
uncompressed_pdf_path = 'WORKING_DIRECTORY/uncompressed'

# DO NOT EDIT ANYTHING BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING.

# Substring marking the line where the record label begins
# (presumably the start of "Prontuario" -- confirm against a sample PDF).
exp_record_start = '(Prontu'
# Substring marking the last fragment of the record label; the next line
# containing match_field after it is read as the record value.
exp_record_last = '(o: ) Tj'
# Substring marking the end of the patient-name label (presumably the tail of
# "Paciente: " -- confirm); the next line containing match_field after it is
# read as the patient name.
exp_name = '(ente: ) Tj'
# Token that a line must contain to be treated as a text field
# ("Tj" is the PDF show-text operator).
match_field = 'Tj'
# Characters a value may consist of to be printed as a plain record.
record_allowed = "0123456789/"
def grab_field(line):
    """Return the text between the first '(' and the last ')' of *line*.

    The bounds come straight from ``str.find`` / ``str.rfind``: when *line*
    has no '(' the slice starts at index 0, and when it has no ')' the slice
    stops just before the final character.
    """
    start = line.find("(") + 1
    stop = line.rfind(")")
    return line[start:stop]
def pad_field(line, num):
    """Return *line* right-padded with spaces to a width of *num* characters.

    A *line* that is already *num* characters or longer is returned unchanged.
    """
    return line.ljust(num)
def is_a_record(record):
    """Return True when every character of *record* occurs in record_allowed.

    An empty *record* has no offending character and therefore yields True.
    """
    for char in record:
        if char not in record_allowed:
            return False
    return True
def is_delimiter_enter_reached(match_str):
    """Return True when *match_str* contains the record-start marker.

    exp_record_start is a single string, so it must be matched as a whole.
    Iterating over it directly would walk its individual characters and
    report a match as soon as any one of them appears in *match_str*.
    A list or tuple of markers is also accepted; any one of them matching
    is enough.
    """
    if isinstance(exp_record_start, str):
        markers = (exp_record_start,)
    else:
        markers = exp_record_start
    return any(marker in match_str for marker in markers)
def main():
    """Print one ``record = [name]`` line per file in uncompressed_pdf_path.

    Each regular file in the folder is opened in binary mode and decoded line
    by line, silently dropping bytes that cannot be decoded. Two values are
    looked for:

    * name: taken from the first line containing match_field that comes
      after a line containing exp_name;
    * record: taken from the first line containing match_field that comes
      after a line containing exp_record_last, which itself only counts once
      a line containing exp_record_start has been seen.

    A record equal to 'Mdi' is replaced by an empty string. A record that
    fails is_a_record is printed as ``* (value)``. Scanning of a file stops
    as soon as both values have been captured.
    """
    # Only regular files are processed; subdirectories are skipped.
    source_dir = pathlib.Path(uncompressed_pdf_path)
    pdf_files = [entry for entry in source_dir.iterdir() if entry.is_file()]

    for pdf_file in pdf_files:
        name = ''
        data = ''
        with open(pdf_file, 'rb') as stream:
            name_label_seen = False      # a line with exp_name was found
            record_label_begun = False   # a line with exp_record_start was found
            record_value_due = False     # a line with exp_record_last was found
            name_done = False
            record_done = False
            for raw_line in stream:
                if name_done and record_done:
                    break
                line = raw_line.decode(errors='ignore')

                if not name_done:
                    if name_label_seen:
                        if match_field in line:
                            name = grab_field(line)
                            name_done = True
                    elif exp_name in line:
                        name_label_seen = True
                        # The label line itself is never checked for a record.
                        continue

                if not record_done:
                    if exp_record_start in line:
                        record_label_begun = True
                    elif record_label_begun:
                        if exp_record_last in line:
                            record_value_due = True
                        elif record_value_due and match_field in line:
                            value = grab_field(line)
                            data = '' if value == 'Mdi' else value
                            record_done = True

        # At this point name holds the patient name and data holds the record.
        if is_a_record(data):
            label = data
        else:
            label = '* (' + data + ')'
        print(''.join((pad_field(label, 15), '= [', name, ']')))
# Run the extraction only when this file is executed as a script, so that
# importing the module does not trigger a directory scan.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment