davidcesarino/hulw-pdf-labs-extractor.py

## hulw-pdf-labs-extractor.py
# coding=utf-8

# Copyright 2020 David Cesarino de Sousa
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

# Path for uncompressed PDF files. You can generate the uncompressed files with the following bash script:
#
# #!/bin/bash
# for f in ./*.PDF; do
# 	file=$(echo $f | cut -c 3-8)-deflated.pdf
# 	qpdf --stream-data=uncompress $f ./uncompressed/$file
# done
#
# Steps:
# 1. Store your retrieved PDFs from Lisweb in your work directory. Let's call it WORKING_DIRECTORY.
# 2. Place the bash script in WORKING_DIRECTORY.
# 3. Run the bash script.
# 4. Replace 'WORKING_DIRECTORY' in the variable below with the full path of your real working directory.
# 5. Run this Python script.
uncompressed_pdf_path = 'WORKING_DIRECTORY/uncompressed'

# DO NOT EDIT ANYTHING BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING.
# Marker that opens the record section: main() ignores every record-related
# line until a line containing this text has been seen.
exp_record_start = '(Prontu'
# Label line for the record: the first later line containing match_field is
# read as the record value.
exp_record_last = '(o: ) Tj'
# Label line for the patient name: the first later line containing
# match_field is read as the name.
exp_name = '(ente: ) Tj'
# Substring that identifies a line carrying a text field (presumably the PDF
# "Tj" show-text operator -- TODO confirm).
match_field = 'Tj'
# Characters a record may consist of; is_a_record() rejects anything else.
record_allowed = "0123456789/"


def grab_field(line):
    """Return the text enclosed by the first "(" and the last ")" of ``line``.

    A missing delimiter is not treated specially: ``str.find``/``str.rfind``
    report -1 for it, so an absent "(" makes the slice start at index 0 and an
    absent ")" makes it stop just before the final character.
    """
    opening = line.find("(")
    closing = line.rfind(")")
    return line[opening + 1:closing]


def pad_field(line, num):
    """Right-pad ``line`` with spaces up to a total width of ``num``.

    Text that is already ``num`` characters long or longer is returned
    unchanged; it is never truncated.
    """
    return line.ljust(num)


def is_a_record(record):
    """Tell whether ``record`` is made up solely of allowed record characters.

    The allowed characters are those listed in ``record_allowed``. An empty
    ``record`` has no offending character and is therefore accepted.
    """
    for char in record:
        if char not in record_allowed:
            return False
    return True


def is_delimiter_enter_reached(match_str):
    """Tell whether ``match_str`` contains the record-start delimiter.

    ``exp_record_start`` may be a single string or an iterable of alternative
    strings. A lone string is wrapped into a tuple first: iterating over it
    directly would walk its individual characters, so any text containing
    e.g. a "(" would wrongly count as a delimiter.

    Returns:
        True if at least one delimiter occurs in ``match_str``, else False.
    """
    if isinstance(exp_record_start, str):
        delimiters = (exp_record_start,)
    else:
        delimiters = exp_record_start
    return any(delimiter in match_str for delimiter in delimiters)


def _extract_name_and_record(lines):
    """Scan the raw lines of one file and return its ``(name, record)`` pair.

    Args:
        lines: iterable of ``bytes`` lines, e.g. a file opened in binary mode.

    Returns:
        A ``(name, data)`` tuple of strings. Either element is '' when the
        corresponding field was not found.
    """
    name = ''
    data = ''

    is_name_read = False    # the name label line (exp_name) has been seen
    is_record_read = False  # the record section marker has been seen
    is_record_next = False  # the record label line has been seen

    is_name_void = False    # name captured; stop looking for it
    is_record_void = False  # record captured; stop looking for it

    for raw_line in lines:
        # Both fields captured: the rest of the file is irrelevant.
        if is_record_void and is_name_void:
            break
        # Undecodable bytes are dropped rather than raising.
        line = raw_line.decode(errors='ignore')

        if not is_name_void:
            if not is_name_read and exp_name in line:
                is_name_read = True
                continue
            elif is_name_read and match_field in line:
                # First field line after the name label holds the name.
                name = grab_field(line)
                is_name_void = True

        if not is_record_void:
            if exp_record_start in line:
                is_record_read = True
                continue
            elif not is_record_read:
                continue
            elif exp_record_last in line:
                is_record_next = True
                continue
            elif is_record_next and match_field in line:
                # First field line after the record label holds the record.
                data = grab_field(line)
                # 'Mdi' is treated as "no record" (presumably the start of
                # the following label when the value is missing -- confirm).
                data = '' if data == 'Mdi' else data
                is_record_void = True

    return name, data


def main():
    """Print one ``record = [name]`` line per file in the uncompressed path.

    Every regular file directly inside ``uncompressed_pdf_path`` is scanned.
    A record containing characters outside ``record_allowed`` is flagged by
    printing it as ``* (record)``.
    """
    # iterdir() yields entries in arbitrary order; sort so that the output
    # is reproducible from run to run.
    flist = sorted(
        p for p in pathlib.Path(uncompressed_pdf_path).iterdir() if p.is_file()
    )

    for f in flist:
        with open(f, 'rb') as file:
            name, data = _extract_name_and_record(file)

        # Now name contains the patient name, and data contains the record.
        formatted = data if is_a_record(data) else '* (' + data + ')'
        print(pad_field(formatted, 15) + '= [' + name + ']')


# Run the extraction only when executed as a script, so that importing this
# module (e.g. to reuse its helpers) does not scan the directory.
if __name__ == "__main__":
    main()
	# coding=utf-8

	# Copyright 2020 David Cesarino de Sousa
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import pathlib

	# Path for uncompressed PDF files. You can generate the uncompressed files with the following bash script:
	#
	# #!/bin/bash
	# for f in ./*.PDF; do
	# file=$(echo $f | cut -c 3-8)-deflated.pdf
	# qpdf --stream-data=uncompress $f ./uncompressed/$file
	# done
	#
	# Steps:
	# 1. Store your retrieved PDFs from Lisweb in your work directory. Let's call it WORKING_DIRECTORY.
	# 2. Place the bash script in WORKING_DIRECTORY.
	# 3. Run the bash script.
	# 4. Replace 'WORKING_DIRECTORY' in the variable below with the full path of your real working directory.
	# 5. Run this Python script.
	uncompressed_pdf_path = 'WORKING_DIRECTORY/uncompressed'

	# DO NOT EDIT ANYTHING BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING.
	# Marker that opens the record section: main() ignores every record-related
	# line until a line containing this text has been seen.
	exp_record_start = '(Prontu'
	# Label line for the record: the first later line containing match_field is
	# read as the record value.
	exp_record_last = '(o: ) Tj'
	# Label line for the patient name: the first later line containing
	# match_field is read as the name.
	exp_name = '(ente: ) Tj'
	# Substring that identifies a line carrying a text field (presumably the PDF
	# "Tj" show-text operator -- TODO confirm).
	match_field = 'Tj'
	# Characters a record may consist of; is_a_record() rejects anything else.
	record_allowed = "0123456789/"


	def grab_field(line):
	    """Return the text enclosed by the first "(" and the last ")" of ``line``.

	    A missing delimiter is not treated specially: ``str.find``/``str.rfind``
	    report -1 for it, so an absent "(" makes the slice start at index 0 and
	    an absent ")" makes it stop just before the final character.
	    """
	    return line[line.find("(") + 1:line.rfind(")")]


	def pad_field(line, num):
	    """Right-pad ``line`` with spaces up to a total width of ``num``.

	    Text that is already ``num`` characters long or longer is returned
	    unchanged; it is never truncated.
	    """
	    return line.ljust(num)


	def is_a_record(record):
	    """Tell whether ``record`` consists solely of allowed record characters.

	    The allowed characters are those listed in ``record_allowed``. An empty
	    ``record`` has no offending character and is therefore accepted.
	    """
	    return all(c in record_allowed for c in record)


	def is_delimiter_enter_reached(match_str):
	    """Tell whether ``match_str`` contains the record-start delimiter.

	    ``exp_record_start`` may be a single string or an iterable of
	    alternative strings. A lone string is wrapped into a tuple first:
	    iterating over it directly would walk its individual characters, so any
	    text containing e.g. a "(" would wrongly count as a delimiter.

	    Returns:
	        True if at least one delimiter occurs in ``match_str``, else False.
	    """
	    if isinstance(exp_record_start, str):
	        delimiters = (exp_record_start,)
	    else:
	        delimiters = exp_record_start
	    return any(delimiter in match_str for delimiter in delimiters)


	def _extract_name_and_record(lines):
	    """Scan the raw lines of one file and return its ``(name, record)`` pair.

	    Args:
	        lines: iterable of ``bytes`` lines, e.g. a file opened in binary mode.

	    Returns:
	        A ``(name, data)`` tuple of strings. Either element is '' when the
	        corresponding field was not found.
	    """
	    name = ''
	    data = ''

	    is_name_read = False    # the name label line (exp_name) has been seen
	    is_record_read = False  # the record section marker has been seen
	    is_record_next = False  # the record label line has been seen

	    is_name_void = False    # name captured; stop looking for it
	    is_record_void = False  # record captured; stop looking for it

	    for raw_line in lines:
	        # Both fields captured: the rest of the file is irrelevant.
	        if is_record_void and is_name_void:
	            break
	        # Undecodable bytes are dropped rather than raising.
	        line = raw_line.decode(errors='ignore')

	        if not is_name_void:
	            if not is_name_read and exp_name in line:
	                is_name_read = True
	                continue
	            elif is_name_read and match_field in line:
	                # First field line after the name label holds the name.
	                name = grab_field(line)
	                is_name_void = True

	        if not is_record_void:
	            if exp_record_start in line:
	                is_record_read = True
	                continue
	            elif not is_record_read:
	                continue
	            elif exp_record_last in line:
	                is_record_next = True
	                continue
	            elif is_record_next and match_field in line:
	                # First field line after the record label holds the record.
	                data = grab_field(line)
	                # 'Mdi' is treated as "no record" (presumably the start of
	                # the following label when the value is missing -- confirm).
	                data = '' if data == 'Mdi' else data
	                is_record_void = True

	    return name, data


	def main():
	    """Print one ``record = [name]`` line per file in the uncompressed path.

	    Every regular file directly inside ``uncompressed_pdf_path`` is scanned.
	    A record containing characters outside ``record_allowed`` is flagged by
	    printing it as ``* (record)``.
	    """
	    # iterdir() yields entries in arbitrary order; sort so that the output
	    # is reproducible from run to run.
	    flist = sorted(
	        p for p in pathlib.Path(uncompressed_pdf_path).iterdir() if p.is_file()
	    )

	    for f in flist:
	        with open(f, 'rb') as file:
	            name, data = _extract_name_and_record(file)

	        # Now name contains the patient name, and data contains the record.
	        formatted = data if is_a_record(data) else '* (' + data + ')'
	        print(pad_field(formatted, 15) + '= [' + name + ']')


	# Run the extraction only when executed as a script, so that importing this
	# module (e.g. to reuse its helpers) does not scan the directory.
	if __name__ == "__main__":
	    main()