/**
 * Patterns used to locate headings and validate field values in the
 * text lines of a PAN card.
 *
 * None of the patterns carries flags, so they hold no match state
 * between calls.
 */
const PAN_REGEX = {
  // Heading text containing "GOVT" or "INDIA".
  govt: /GOVT|INDIA/,
  // Heading text containing "INCOME" or "TAX".
  income_tax: /INCOME|TAX/,
  // Heading that introduces the father's name.
  fathers_name_heading: /Father/,
  // Heading that introduces the date of birth.
  dob_heading: /Date|Birth/,
  // A date written as two digits / two digits / four digits; group 1 captures it.
  date_format: /(\d{2}\/\d{2}\/\d{4})/,
  // Heading that introduces the PAN number.
  number_heading: /Permanent|Account|Number/,
  // A run of at least ten upper-case letters or digits.
  number_format: /[A-Z0-9]{10,}/,
  // Whole string made of upper-case letters, whitespace and dots, starting with a letter.
  words_format: /^[A-Z]+[A-Z\s.]+$/
};
Filter out all text that doesn't match the defined field formats for the document. For instance, other than the date of birth, no text on a PAN card should contain special characters.
/**
 * Drops every whitespace-separated token of a line that matches none of
 * the PAN_REGEX patterns.
 *
 * @param lineText A single line of text.
 * @returns The surviving tokens, in their original order, joined by single spaces.
 */
const filterNoiseFromLine = (lineText: string) => {
  const patterns = _.values(PAN_REGEX);
  // A token is kept as soon as any one pattern matches it.
  const matchesAnyPattern = (token: string) =>
    _.some(patterns, (pattern: RegExp) => pattern.test(token));

  return lineText
    .split(/\s/)
    .filter(matchesAnyPattern)
    .join(' ');
};
Find the line numbers that match the field-heading regexes for the document.
/**
 * Finds the index of the first line matching each PAN heading pattern.
 *
 * Each line is assigned to at most one heading. The headings are tried in
 * the order number, income tax, govt, father's name, date of birth, and a
 * heading that has already been located is skipped.
 *
 * @param lines Text lines of the document, in reading order.
 * @returns An object holding, for each heading, the 0-based index of the
 *          line where it was first found, or undefined if it never matched.
 */
const parsePANHeadingLineNumbers = (lines: Array<string>) => {
  const panHeadingLineNumbers: {
    pan_number_text_line: number | undefined,
    pan_IT_text_line: number | undefined,
    pan_GOVT_text_line: number | undefined,
    pan_DOB_text_line: number | undefined,
    pan_fathers_name_text_line: number | undefined
  } = {
    pan_number_text_line: undefined,
    pan_IT_text_line: undefined,
    pan_GOVT_text_line: undefined,
    pan_DOB_text_line: undefined,
    pan_fathers_name_text_line: undefined
  };
  _.forEach(lines, (line, index) => {
    // Compare against undefined rather than using `!value`: index 0 is a
    // valid result and must not be treated as "not found yet", otherwise a
    // later matching line would overwrite a heading found on the first line.
    if (panHeadingLineNumbers['pan_number_text_line'] === undefined && PAN_REGEX['number_heading'].exec(line)) {
      panHeadingLineNumbers['pan_number_text_line'] = index;
    }
    else if (panHeadingLineNumbers['pan_IT_text_line'] === undefined && PAN_REGEX['income_tax'].exec(line)) {
      panHeadingLineNumbers['pan_IT_text_line'] = index;
    }
    else if (panHeadingLineNumbers['pan_GOVT_text_line'] === undefined && PAN_REGEX['govt'].exec(line)) {
      panHeadingLineNumbers['pan_GOVT_text_line'] = index;
    }
    else if (panHeadingLineNumbers['pan_fathers_name_text_line'] === undefined && PAN_REGEX['fathers_name_heading'].exec(line)) {
      panHeadingLineNumbers['pan_fathers_name_text_line'] = index;
    }
    else if (panHeadingLineNumbers['pan_DOB_text_line'] === undefined && PAN_REGEX['dob_heading'].exec(line)) {
      panHeadingLineNumbers['pan_DOB_text_line'] = index;
    }
  });
  return panHeadingLineNumbers;
};
This step extracts the field values based on the line numbers of the field headings.
- The values of fields that don't have a heading can be determined relative to the line numbers of the other field headings.
/**
 * Assembles the parsed PAN result from the lines adjacent to each heading.
 *
 * The PAN number, date of birth and father's name are read from the line
 * immediately after their heading. The holder's name has no heading of its
 * own and is read from the line immediately before the father's-name heading.
 * When a heading index is undefined the index arithmetic yields NaN, so the
 * corresponding processor receives undefined.
 *
 * @param textLines Text lines of the document, in reading order.
 * @param panHeadingLineNumbers Heading line indices keyed by
 *        `pan_number_text_line`, `pan_DOB_text_line` and
 *        `pan_fathers_name_text_line`.
 * @returns An object with `document_type`, `identification_number`, `name`,
 *          `date_of_birth` and `fathers_name`.
 */
const parsePANText = (textLines: Array<string>,
  panHeadingLineNumbers: Record<string, any>) => {
  const documentType = Constants.DOCUMENT_TYPES.PAN_CARD;
  const {
    pan_number_text_line: numberHeadingIndex,
    pan_DOB_text_line: dobHeadingIndex,
    pan_fathers_name_text_line: fatherHeadingIndex
  } = panHeadingLineNumbers;

  const parsedResult: any = {
    document_type: documentType,
    identification_number: processPANNumber(textLines[numberHeadingIndex + 1]),
    // The name sits on the line just above the father's-name heading.
    name: processPANName(textLines[fatherHeadingIndex - 1]),
    date_of_birth: processPANDateOfBirth(textLines[dobHeadingIndex + 1]),
    fathers_name: processPANFathersName(textLines[fatherHeadingIndex + 1])
  };
  return parsedResult;
};
- Any field value which doesn't match the expected field regex can be rejected.
/**
 * Extracts the date of birth from a line of text.
 *
 * @param text The line expected to contain a DD/MM/YYYY date.
 * @returns A UTC moment parsed from the first DD/MM/YYYY-shaped substring,
 *          or undefined when the text is empty or contains no such substring.
 *          NOTE(review): the moment is returned even if the digits do not
 *          form a real calendar date; callers presumably check `isValid()`.
 */
const processPANDateOfBirth = (text: string) => {
  if (_.isEmpty(text)) {
    return undefined;
  }
  const dateMatch = PAN_REGEX['date_format'].exec(text);
  if (!dateMatch) {
    return undefined;
  }
  // The pattern is unanchored, so the line may carry other characters around
  // the date. Parse only the matched substring; handing the whole line to
  // moment's forgiving parser lets stray digits before the date be consumed
  // as the day.
  return moment.utc(dateMatch[0], 'DD/MM/YYYY');
};
- (Optionally) Use the known format of the field to correct common OCR errors. For instance, the first 5 characters of a PAN number are upper-case English letters, followed by 4 digits and 1 final letter.
/**
 * Normalises a PAN number read from a line of text, correcting common
 * letter/digit confusions based on the known layout: 5 letters, 4 digits,
 * 1 letter.
 *
 * @param text The line expected to hold the PAN number.
 * @returns The first ten characters of `text` with digits mapped to
 *          look-alike letters in the letter positions and letters mapped to
 *          look-alike digits in the digit positions, or undefined when the
 *          text is empty or contains no run of ten upper-case letters/digits.
 */
const processPANNumber = (text: string) => {
  if (_.isEmpty(text)) {
    return undefined;
  }
  if (!PAN_REGEX['number_format'].exec(text)) {
    return undefined;
  }

  // Global regexes are required here: `replace` with a string pattern only
  // substitutes the first occurrence, which left repeated misreads
  // (e.g. two '0's in the prefix) partially uncorrected.
  const digitsToLetters = (part: string) => part
    .replace(/0/g, 'O')
    .replace(/8/g, 'B')
    .replace(/5/g, 'S')
    .replace(/1/g, 'I');
  const lettersToDigits = (part: string) => part
    .replace(/O/g, '0')
    .replace(/D/g, '0')
    .replace(/B/g, '8')
    .replace(/S/g, '5')
    .replace(/I/g, '1')
    .replace(/!/g, '1');

  // The format check above guarantees at least ten characters.
  const preFix = text.slice(0, 5);
  const numbers = text.slice(5, 9);
  const suffix = text.charAt(9);

  return digitsToLetters(preFix) + lettersToDigits(numbers) + digitsToLetters(suffix);
};