Last active
January 6, 2025 10:46
-
-
Save mrtnzlml/1c7470e48acb840b4d6fdb6af2fcb2c9 to your computer and use it in GitHub Desktop.
Corrects dates that Rossum read incorrectly, based on a specified pattern and vendor ID. Use this when certain vendors use ambiguous date formats and you know the date structure in advance.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"patterns": [ | |
{ | |
"//": "Vendor, UAB (dates '24.11.30' read as '2030-11-24' but should be '2024-11-30')", | |
"condition_field_id": "company_match__RegistrationNumber", | |
"condition_field_value": "123456789", | |
"source_field_id": "date_issue", | |
"source_field_format": "YY.MM.DD", | |
"target_field_id": "date_issue_normalized" | |
}, | |
{ | |
"//": "Vendor, UAB (dates '24.11.30' read as '2030-11-24' but should be '2024-11-30')", | |
"condition_field_id": "company_match__RegistrationNumber", | |
"condition_field_value": "122596696", | |
"source_field_id": "date_due", | |
"source_field_format": "YY.MM.DD", | |
"target_field_id": "date_due_normalized" | |
}, | |
{ | |
"//": "Vendor, LLC (dates '5/1/2024' read as '2024-01-05' but should be '2024-05-01')", | |
"condition_field_id": "company_match__RegistrationNumber", | |
"condition_field_value": "987654321", | |
"source_field_id": "date_issue", | |
"source_field_format": "M/D/YYYY", | |
"target_field_id": "date_issue_normalized" | |
} | |
] | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from txscript import TxScript, is_empty | |
def rossum_hook_request_handler(payload):
    """
    Analyzes the *date* value from the `source_field_id` datapoint and outputs the corrected value to the
    `target_field_id` datapoint.

    The analysis is made based on the pattern specified in the `source_field_format` configuration
    (defaults to "YYYY-MM-DD"). For every configured pattern:

    - If the `condition_field_id` datapoint equals `condition_field_value`, the source date is re-parsed with
      `process_date` and the result is written to the target.
    - Otherwise the source value is copied to the target as is, unless an earlier pattern in the same call has
      already corrected that target (several vendors may share one target field).
    - If the target datapoint is listed in the payload's `updated_datapoints`, the pattern is skipped so that the
      manual edit is preserved. Other patterns are still processed.

    Configuration example:

    ```
    {
      "patterns": [
        {
          "//": "Vendor, LLC (dates '24.11.30' read as '2030-11-24' but should be '2024-11-30')",
          "condition_field_id": "company_match__RegistrationNumber",
          "condition_field_value": "123456789",
          "source_field_id": "date_issue",
          "source_field_format": "YY.MM.DD",
          "target_field_id": "date_issue_normalized"
        }
      ]
    }
    ```

    :param payload: Hook payload; `settings.patterns` and `updated_datapoints` are read from it.
    :return: The hook response produced by `TxScript.hook_response()`.
    :raises KeyError: If a pattern lacks `condition_field_id` or `condition_field_value`.
    """
    t = TxScript.from_payload(payload)
    # `or` guards against keys that are present but explicitly null.
    pattern_configs = (payload.get("settings") or {}).get("patterns") or []
    updated_datapoints = payload.get("updated_datapoints") or []

    # Targets that a matching pattern has already corrected during this call. A later pattern for a
    # different vendor must not overwrite them with the uncorrected source value.
    corrected_targets = set()

    for pattern_config in pattern_configs:
        condition_field_id = pattern_config["condition_field_id"]
        condition_field_value = pattern_config["condition_field_value"]
        source_field_id = pattern_config.get("source_field_id")
        source_field_format = pattern_config.get("source_field_format", "YYYY-MM-DD")
        target_field_id = pattern_config.get("target_field_id")

        source_field_value = getattr(t.field, source_field_id)

        if getattr(t.field, target_field_id).id in updated_datapoints:
            # Allow manual updates to the target field (otherwise the source field would always overwrite
            # the target). Only this pattern is skipped; the remaining patterns must still run.
            continue

        if getattr(t.field, condition_field_id) == condition_field_value:
            setattr(t.field, target_field_id, process_date(source_field_value, source_field_format))
            corrected_targets.add(target_field_id)
        elif target_field_id not in corrected_targets:
            # No correction applies for this vendor: mirror the source value into the target.
            setattr(t.field, target_field_id, source_field_value)

    return t.hook_response()
def process_date(date_value, mask):
    """
    Re-parse a date field from its raw text using the given mask.

    The raw text is taken from `date_value.attr.ocr_raw_text`, falling back to `date_value.attr.rir_raw_text`.
    It is split with `split_date` and turned into a `datetime`.

    The original `date_value` is returned unchanged when:

    - the value is empty (per `is_empty`),
    - no raw text is available,
    - the raw text does not fit the mask or does not form a valid calendar date,
    - the mask does not yield all of year, month and day.

    :param date_value: Field value exposing `.attr.ocr_raw_text` and `.attr.rir_raw_text`.
    :param mask: Date mask such as "YY.MM.DD" or "M/D/YYYY".
    :return: A `datetime` built from the re-parsed parts, or `date_value` as a fallback.
    """
    if is_empty(date_value):
        return date_value

    ocr_text = date_value.attr.ocr_raw_text or date_value.attr.rir_raw_text
    if not ocr_text:
        # Nothing to re-parse; keep the value as it was read.
        return date_value

    try:
        result = split_date(ocr_text, mask)
        print(f"Correcting input date: '{ocr_text}' with mask '{mask}' to {result}")
        return datetime(year=result["year"], month=result["month"], day=result["day"])
    except (ValueError, KeyError):
        # ValueError: text does not match the mask, a part is not numeric, or the date is invalid.
        # KeyError: the mask did not provide one of year/month/day.
        return date_value
def split_date(date_string, mask):
    """
    Split a raw date string into numeric components according to a mask.

    Every non-alphanumeric character in the mask is treated as a separator in both the mask and the date
    string. The resulting parts are paired by position, and the first letter of each mask part selects the
    component: 'Y' -> year, 'M' -> month, 'D' -> day. Parts with any other first letter are ignored.

    Years in the range 0-99 are expanded to the 2000s, so "05" becomes 2005 and "24" becomes 2024.

    :param date_string: Raw date text, e.g. "24.11.30".
    :param mask: Date mask, e.g. "YY.MM.DD" or "M/D/YYYY".
    :return: Dict with the keys 'year', 'month' and/or 'day' that the mask defines, as ints.
    :raises ValueError: If the number of parts differs between mask and date string, or a part is not an integer.
    """
    mask_mapping = {'Y': 'year', 'M': 'month', 'D': 'day'}

    # Prepare for extraction by splitting the mask and the date string on the mask's separators.
    # TODO: better validate/handle the masks (they are a bit loosely defined now)
    separators = {c for c in mask if not c.isalnum()}
    to_space = str.maketrans({sep: ' ' for sep in separators})
    mask_parts = mask.translate(to_space).split()
    date_parts = date_string.translate(to_space).split()

    # Verify that the mask matches the date string (loosely).
    if len(mask_parts) != len(date_parts):
        raise ValueError(f"Mask '{mask}' does not match the date string '{date_string}'")

    # Map date parts to their corresponding components.
    components = {}
    for mask_part, date_part in zip(mask_parts, date_parts):
        component = mask_mapping.get(mask_part[0])
        if component is not None:
            components[component] = int(date_part)

    # Expand short years to the 2000s. The check uses the numeric value, not the digit count of the parsed
    # int, because a leading zero ("05") is lost once the text is converted.
    year = components.get('year')
    if year is not None and 0 <= year < 100:
        components['year'] = year + 2000

    return components
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment