Skip to content

Instantly share code, notes, and snippets.

@mrtnzlml
Last active January 6, 2025 10:46
Show Gist options
  • Save mrtnzlml/1c7470e48acb840b4d6fdb6af2fcb2c9 to your computer and use it in GitHub Desktop.
Save mrtnzlml/1c7470e48acb840b4d6fdb6af2fcb2c9 to your computer and use it in GitHub Desktop.
Corrects dates that Rossum has read incorrectly, based on a specified pattern and vendor ID. Use this when certain vendors use ambiguous date formats and you already know the date structure in advance.
{
"patterns": [
{
"//": "Vendor, UAB (dates '24.11.30' read as '2030-11-24' but should be '2024-11-30')",
"condition_field_id": "company_match__RegistrationNumber",
"condition_field_value": "123456789",
"source_field_id": "date_issue",
"source_field_format": "YY.MM.DD",
"target_field_id": "date_issue_normalized"
},
{
"//": "Vendor, UAB (dates '24.11.30' read as '2030-11-24' but should be '2024-11-30')",
"condition_field_id": "company_match__RegistrationNumber",
"condition_field_value": "122596696",
"source_field_id": "date_due",
"source_field_format": "YY.MM.DD",
"target_field_id": "date_due_normalized"
},
{
"//": "Vendor, LLC (dates '5/1/2024' read as '2024-01-05' but should be '2024-05-01')",
"condition_field_id": "company_match__RegistrationNumber",
"condition_field_value": "987654321",
"source_field_id": "date_issue",
"source_field_format": "M/D/YYYY",
"target_field_id": "date_issue_normalized"
}
]
}
from datetime import datetime
from txscript import TxScript, is_empty
def rossum_hook_request_handler(payload):
    """
    Analyze the *date* value of each `source_field_id` datapoint and write the result to `target_field_id`.

    For every entry in the `patterns` hook setting:

    * when the `condition_field_id` datapoint equals `condition_field_value`, the source date is re-parsed
      according to `source_field_format` (see `process_date`) and the result is written to the target;
    * otherwise the source value is copied to the target unchanged, unless an earlier pattern has already
      written a corrected value to that same target.

    Targets whose datapoint ID is listed in the payload's `updated_datapoints` are skipped so that manual
    edits are kept.

    Configuration example:
    ```
    {
      "patterns": [
        {
          "//": "Vendor, LLC (dates '24.11.30' read as '2030-11-24' but should be '2024-11-30')",
          "condition_field_id": "company_match__RegistrationNumber",
          "condition_field_value": "123456789",
          "source_field_id": "date_issue",
          "source_field_format": "YY.MM.DD",
          "target_field_id": "date_issue_normalized"
        }
      ]
    }
    ```

    :param payload: hook request payload; `settings.patterns` and `updated_datapoints` are read from it.
    :return: the hook response built by `TxScript.hook_response()`.
    """
    t = TxScript.from_payload(payload)
    pattern_configs = payload.get("settings", {}).get("patterns", [])
    updated_datapoints = payload.get("updated_datapoints", [])

    # Targets that already received a corrected value from a matching pattern. Several patterns may share
    # one target (one per vendor); a later non-matching pattern must not overwrite the correction.
    corrected_targets = set()

    for pattern_config in pattern_configs:
        condition_field_id = pattern_config["condition_field_id"]
        condition_field_value = pattern_config["condition_field_value"]
        source_field_id = pattern_config.get("source_field_id")
        source_field_format = pattern_config.get("source_field_format", "YYYY-MM-DD")
        source_field_value = getattr(t.field, source_field_id)
        target_field_id = pattern_config.get("target_field_id")

        if getattr(t.field, target_field_id).id in updated_datapoints:
            # Allow manual updates to the target field (source field would otherwise always overwrite it).
            # Only this pattern is skipped; patterns for other targets must still be processed.
            continue

        if getattr(t.field, condition_field_id) == condition_field_value:
            target_field_value = process_date(source_field_value, source_field_format)
            setattr(t.field, target_field_id, target_field_value)
            corrected_targets.add(target_field_id)
        elif target_field_id not in corrected_targets:
            # No pattern has matched for this target so far: fall back to a plain copy of the source value.
            setattr(t.field, target_field_id, source_field_value)

    return t.hook_response()
def process_date(date_value, mask):
    """
    Re-parse a date field from its raw text using the given mask.

    :param date_value: field value exposing `attr.ocr_raw_text` and `attr.rir_raw_text`.
    :param mask: expected date layout, e.g. "YY.MM.DD" or "M/D/YYYY" (see `split_date`).
    :return: a `datetime` built from the raw text when it can be parsed with the mask; otherwise the
        original `date_value` unchanged (empty value, no raw text, mask mismatch, incomplete mask, or
        an impossible calendar date).
    """
    if is_empty(date_value):
        return date_value

    ocr_text = date_value.attr.ocr_raw_text or date_value.attr.rir_raw_text
    if not ocr_text:
        # Neither raw-text attribute is set, so there is nothing to re-parse.
        return date_value

    try:
        result = split_date(ocr_text, mask)
        corrected = datetime(year=result["year"], month=result["month"], day=result["day"])
    except (ValueError, KeyError):
        # ValueError: mask/text mismatch, non-numeric part, or invalid calendar date.
        # KeyError: the mask does not define all of year, month and day.
        return date_value

    print(f"Correcting input date: '{ocr_text}' with mask '{mask}' to {result}")
    return corrected
def split_date(date_string, mask):
    """
    Split a raw date string into numeric components according to a mask.

    The mask is a sequence of parts made of the letters Y (year), M (month) and D (day), separated by any
    non-alphanumeric characters, e.g. "YY.MM.DD" or "M/D/YYYY". Only the first letter of each mask part is
    significant. The separators found in the mask (and any whitespace) are used to split the date string.

    :param date_string: raw date text, e.g. "24.11.30".
    :param mask: date layout describing the order of the components.
    :return: dict with the keys "year", "month" and/or "day" (only those present in the mask) mapped to
        ints. Years below 100 are expanded to the 2000s.
    :raises ValueError: if `date_string` is not a string, the number of parts differs from the mask, or a
        part is not an integer.
    """
    if not isinstance(date_string, str):
        raise ValueError(f"Date string must be text, got {date_string!r}")

    mask_mapping = {'Y': 'year', 'M': 'month', 'D': 'day'}

    # Prepare for extraction by splitting the mask and the date string on the mask's separators.
    # TODO: better validate/handle the masks (they are a bit loosely defined now)
    separators = {c for c in mask if not c.isalnum()}
    to_space = str.maketrans({sep: ' ' for sep in separators})
    mask_parts = mask.translate(to_space).split()
    date_parts = date_string.translate(to_space).split()

    # Verify that the mask matches the date string (loosely)
    if len(mask_parts) != len(date_parts):
        raise ValueError(f"Mask '{mask}' does not match the date string '{date_string}'")

    # Map date parts to their corresponding components
    components = {}
    for mask_part, date_part in zip(mask_parts, date_parts):
        component = mask_mapping.get(mask_part[0])
        if component is not None:
            components[component] = int(date_part)

    # Expand short years to the 2000s. Compare the numeric value rather than the digit count so that
    # years with a leading zero ("05" -> 5) are expanded as well.
    year = components.get('year')
    if year is not None and 0 <= year < 100:
        components['year'] = year + 2000

    return components
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment